From 779d58053a5be986d80fd5c53afe235c4a5c8102 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 4 Aug 2022 10:31:46 +0300 Subject: [PATCH 001/119] Port some configs from DSpace 6 --- dspace/config/dspace.cfg | 42 ++++++++++++------- dspace/config/modules/authentication-ldap.cfg | 21 +++++----- dspace/config/modules/solr-statistics.cfg | 2 +- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index b7cc13e508dc..0e34e2a3cccb 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -198,6 +198,8 @@ mail.charset = UTF-8 # mail.smtp.socketFactory.class=javax.net.ssl.SSLSocketFactory, \ # mail.smtp.socketFactory.fallback=false +mail.extraproperties = mail.smtp.starttls.enable=true, mail.smtp.ssl.protocols=TLSv1.2 + # An option is added to disable the mailserver. By default, this property is set to false # By setting mail.server.disabled = true, DSpace will not send out emails. # It will instead log the subject of the email which should have been sent @@ -311,7 +313,7 @@ handle.canonical.prefix = ${dspace.ui.url}/handle/ # CNRI Handle prefix # (Defaults to a dummy/fake prefix of 123456789) -handle.prefix = 123456789 +handle.prefix = 10568 # Directory for installing Handle server files handle.dir = ${dspace.dir}/handle-server @@ -319,7 +321,10 @@ handle.dir = ${dspace.dir}/handle-server # List any additional prefixes that need to be managed by this handle server # (as for examle handle prefix coming from old dspace repository merged in # that repository) -# handle.additional.prefixes = prefix1[, prefix2] +handle.additional.prefixes = 10947 + +# Allow DSpace to resolve handles from multiple prefixes (see HandlePlugin.java) +handle.plugin.checknameauthority = false # Whether to enable the DSpace handle resolver endpoints necessary for # https://github.com/DSpace/Remote-Handle-Resolver @@ -445,8 +450,8 @@ useProxies = true #Names of the enabled MediaFilter or FormatFilter plugins filter.plugins 
= Text Extractor -filter.plugins = JPEG Thumbnail -filter.plugins = PDFBox JPEG Thumbnail +filter.plugins = ImageMagick Image Thumbnail +filter.plugins = ImageMagick PDF Thumbnail # [To enable Branded Preview]: uncomment and insert the following into the plugin list @@ -970,7 +975,7 @@ metadata.hide.person.email = true # Whether or not we REQUIRE that a file be uploaded # during the 'Upload' step in the submission process # Defaults to true; If set to 'false', submitter has option to skip upload -#webui.submit.upload.required = true +webui.submit.upload.required = false # Which field should be used for type-bind # Defaults to 'dc.type'; If changing this value, you must also update the related @@ -1023,8 +1028,8 @@ cc.license.locale = en # Maximum width and height (in pixels) of generated thumbnails # NOTE: In the UI's base theme, `--ds-thumbnail-max-width` defaults to 175px. # So, if you set 'thumbnail.maxwidth' >175, you may wish to modify that UI style variable as well. -thumbnail.maxwidth = 175 -thumbnail.maxheight = 175 +thumbnail.maxwidth = 300 +thumbnail.maxheight = 300 # Blur before scaling. A little blur before scaling does wonders for keeping # more in check. 
(Only used by JPEGFilter) @@ -1156,8 +1161,16 @@ webui.preview.brand.fontpoint = 12 webui.browse.index.1 = dateissued:item:dateissued webui.browse.index.2 = author:metadata:dc.contributor.*\,dc.creator:text webui.browse.index.3 = title:item:title -webui.browse.index.4 = subject:metadata:dc.subject.*:text -#webui.browse.index.5 = dateaccessioned:item:dateaccessioned +webui.browse.index.4 = subject:metadata:dc.subject.*\,dcterms.subject:text +webui.browse.index.5 = ilrisubject:metadata:cg.subject.ilri:text +webui.browse.index.6 = region:metadata:cg.coverage.region:text +webui.browse.index.7 = country:metadata:cg.coverage.country:text +webui.browse.index.8 = subregion:metadata:cg.coverage.subregion:text +webui.browse.index.9 = basin:metadata:cg.river.basin:text +webui.browse.index.10 = type:metadata:dcterms.type.*:text +webui.browse.index.11 = cipsubject:metadata:cg.subject.cip:text +webui.browse.index.12 = systemsubject:metadata:cg.subject.system:text +webui.browse.index.13 = alliancebiovciatsubject:metadata:cg.subject.alliancebiovciat:text ## example of authority-controlled browse category - see authority control config #webui.browse.index.5 = lcAuthor:metadataAuthority:dc.contributor.author:authority @@ -1198,8 +1211,9 @@ webui.browse.index.4 = subject:metadata:dc.subject.*:text # but otherwise don't want users to choose that option. 
# webui.itemlist.sort-option.1 = title:dc.title:title -webui.itemlist.sort-option.2 = dateissued:dc.date.issued:date +webui.itemlist.sort-option.2 = dateissued:dcterms.issued:date webui.itemlist.sort-option.3 = dateaccessioned:dc.date.accessioned:date +webui.itemlist.sort-option.4 = type:dcterms.type:text # Set the options for how the indexes are sorted # @@ -1353,7 +1367,7 @@ websvc.opensearch.autolink = true websvc.opensearch.validity = 48 # short name used in browsers for search service # should be 16 or fewer characters -websvc.opensearch.shortname = DSpace +websvc.opensearch.shortname = CGSpace # longer (up to 48 characters) name websvc.opensearch.longname = ${dspace.name} # brief service description @@ -1464,7 +1478,7 @@ log.report.dir = ${dspace.dir}/log # You can add more than one 'mark_[value]' options (with different value) in case you need to mark items more than one time for # different purposes. Remember to add the respective beans in file 'config/spring/api/item-marking.xml'. # -# webui.itemlist.columns = dc.date.issued(date), dc.title, dc.contributor.* +webui.itemlist.columns = dcterms.accessRights,dcterms.issued(date),dcterms.type,dc.title,dc.contributor.* # # Additionally, you can override the DC fields used on the listing page for # a given browse index and/or sort option. As a sort option or index may be defined @@ -1532,7 +1546,7 @@ google-analytics.bundles = ORIGINAL # all - Anonymous users can request an item # logged - Login is mandatory to request an item # empty/commented out - request-copy not allowed -request.item.type = all +#request.item.type = all # Should all Request Copy emails go to the helpdesk instead of the item submitter? request.item.helpdesk.override = false # Should a rejection of a copy request send an email back to the requester? 
@@ -1544,7 +1558,7 @@ request.item.reject.email = true #------------------SUBMISSION CONFIGURATION------------------------# #------------------------------------------------------------------# # Field to use for type binding, default dc.type -submit.type-bind.field = dc.type +submit.type-bind.field = dcterms.type #---------------------------------------------------------------# #----------SOLR DATABASE RESYNC SCRIPT CONFIGURATION------------# diff --git a/dspace/config/modules/authentication-ldap.cfg b/dspace/config/modules/authentication-ldap.cfg index bcc29ccac551..422f2f57e132 100644 --- a/dspace/config/modules/authentication-ldap.cfg +++ b/dspace/config/modules/authentication-ldap.cfg @@ -51,7 +51,7 @@ authentication-ldap.autoregister = true # This is the unique identifier field in the LDAP directory # where the username is stored. -#authentication-ldap.id_field = uid +authentication-ldap.id_field = sAMAccountName # This is the object context used when authenticating the # user. It is appended to the id_field and username. @@ -72,31 +72,31 @@ authentication-ldap.autoregister = true # parameter. But again this depends on each individual LDAP server # configuration. # Note: Prepend commas with a backslash to escape them -#authentication-ldap.search_context = ou=people\,ou=faculties +authentication-ldap.search_context = dc=cgiarad\,dc=org # This is the LDAP object field where the user's email address # is stored. "mail" is the default and the most common for # LDAP servers. If the mail field is not found the username # will be used as the email address when creating the eperson # object. -#authentication-ldap.email_field = mail +authentication-ldap.email_field = mail # This is the LDAP object field where the user's last name is # stored. "sn" is the default and is the most common for LDAP # servers. If the field is not found the field will be left # blank in the new eperson object. 
-#authentication-ldap.surname_field = sn +authentication-ldap.surname_field = sn # This is the LDAP object field where the user's given names # are stored. This may not be used or set in all LDAP instances. # If the field is not found the field will be left blank in the # new eperson object. -#authentication-ldap.givenname_field = givenName +authentication-ldap.givenname_field = givenName # This is the field where the user's phone number is stored in # the LDAP directory. If the field is not found the field # will be left blank in the new eperson object. -#authentication-ldap.phone_field = telephoneNumber +authentication-ldap.phone_field = telephoneNumber ##### LDAP users group ##### @@ -104,8 +104,7 @@ authentication-ldap.autoregister = true # If required, a group name can be given here, and all users who log in # to LDAP will automatically become members of this group. This is useful # if you want a group made up of all internal authenticated users. -#authentication-ldap.login.specialgroup = group-name - +authentication-ldap.login.specialgroup = CGIAR_LDAP_USERS ##### Hierarchical LDAP Settings ##### @@ -128,7 +127,7 @@ authentication-ldap.autoregister = true # object scope : 0 # one level scope : 1 # subtree scope : 2 -#authentication-ldap.search_scope = 2 +authentication-ldap.search_scope = 2 # If true, the initial bind will be performed anonymously. #authentication-ldap.search.anonymous = false @@ -150,8 +149,8 @@ authentication-ldap.autoregister = true # in user's full DN. If it's found, assign user to the DSpace group # specified by the right part of the groupmap value (after the ":"). # One user may belong to multiple groups. 
-#authentication-ldap.login.groupmap.1 = ou=ldap-dept1:dspace-group1 -#authentication-ldap.login.groupmap.2 = ou=ldap-dept2:dspace-groupA +authentication-ldap.login.groupmap.1 = DC=ILRI:ILRI_LDAP_USERS +authentication-ldap.login.groupmap.2 = OU=ILRIHUB:ILRI_LDAP_USERS #authentication-ldap.login.groupmap.3 = ou=ldap-dept3:dspace-groupA # If this property is uncommented, it changes the meaning of the left part of diff --git a/dspace/config/modules/solr-statistics.cfg b/dspace/config/modules/solr-statistics.cfg index 073850ca232e..f57132741a21 100644 --- a/dspace/config/modules/solr-statistics.cfg +++ b/dspace/config/modules/solr-statistics.cfg @@ -25,7 +25,7 @@ solr-statistics.configset = statistics # control solr statistics querying to look at "isBot" field to determine # if record is a bot. true by default. -#solr-statistics.query.filter.isBot = true +solr-statistics.query.filter.isBot = true # Whether or not explicit solr.commit can be done in SolrLoggerServiceImpl#postView, or to be left to the autocommit. # Defaults to true (i.e. via autoCommit, no explicit commits); set to false in statistics tests (e.g. StatisticsRestRepositoryIT) From 5583ce91b8222c4695234f4ddbed2c64cf2901d5 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 4 Aug 2022 11:04:10 +0300 Subject: [PATCH 002/119] dspace/config: enable localhost:4000 for REST API rest.cors.allowed-origins allows us to develop on Angular locally while using a remote DSpace API. We will perhaps disable this in the long run after we deploy DSpace 7. --- dspace/config/modules/rest.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/modules/rest.cfg b/dspace/config/modules/rest.cfg index 537eedbd087b..d62565da8dc4 100644 --- a/dspace/config/modules/rest.cfg +++ b/dspace/config/modules/rest.cfg @@ -8,7 +8,7 @@ # Defaults to ${dspace.ui.url} if unspecified (as the UI must have access to the REST API). # Multiple allowed origin URLs may be comma separated. 
Wildcard value (*) is NOT SUPPORTED. # (Requires reboot of servlet container, e.g. Tomcat, to reload) -rest.cors.allowed-origins = ${dspace.ui.url} +rest.cors.allowed-origins = ${dspace.ui.url}, http://localhost:4000 # Whether or not to allow credentials (e.g. cookies) sent by the client/browser in CORS # requests (in "Access-Control-Allow-Credentials" header). From ca9ccf8f2c2d4936e0048adc820c7bde29d9f09f Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 4 Aug 2022 11:13:56 +0300 Subject: [PATCH 003/119] dspace.cfg: disable extra "browse by" I ported these from DSpace 6 but they are aesthetically annoying in DSpace 7 when you have more than ~4. --- dspace/config/dspace.cfg | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 0e34e2a3cccb..05615075d31a 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1162,15 +1162,6 @@ webui.browse.index.1 = dateissued:item:dateissued webui.browse.index.2 = author:metadata:dc.contributor.*\,dc.creator:text webui.browse.index.3 = title:item:title webui.browse.index.4 = subject:metadata:dc.subject.*\,dcterms.subject:text -webui.browse.index.5 = ilrisubject:metadata:cg.subject.ilri:text -webui.browse.index.6 = region:metadata:cg.coverage.region:text -webui.browse.index.7 = country:metadata:cg.coverage.country:text -webui.browse.index.8 = subregion:metadata:cg.coverage.subregion:text -webui.browse.index.9 = basin:metadata:cg.river.basin:text -webui.browse.index.10 = type:metadata:dcterms.type.*:text -webui.browse.index.11 = cipsubject:metadata:cg.subject.cip:text -webui.browse.index.12 = systemsubject:metadata:cg.subject.system:text -webui.browse.index.13 = alliancebiovciatsubject:metadata:cg.subject.alliancebiovciat:text ## example of authority-controlled browse category - see authority control config #webui.browse.index.5 = lcAuthor:metadataAuthority:dc.contributor.author:authority From 007147b088f67ab945b06dfffa3234fc1bb0d532 Mon Sep 17 00:00:00 
2001 From: Alan Orth Date: Fri, 5 Aug 2022 15:57:04 +0300 Subject: [PATCH 004/119] dspace: add controlled vocabularies These were copied directly from our DSpace 6 branch. --- .../cg-contributor-affiliation.xml | 1005 ++++++++ .../cg-contributor-donor.xml | 788 +++++++ .../cg-coverage-subregion.xml | 87 + .../cg-creator-identifier.xml | 1408 ++++++++++++ .../controlled-vocabularies/cg-journal.xml | 2028 +++++++++++++++++ .../cg-species-breed.xml | 885 +++++++ .../dc-contributor-author.xml | 1506 ++++++++++++ .../dcterms-subject.xml | 1414 ++++++++++++ 8 files changed, 9121 insertions(+) create mode 100644 dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml create mode 100644 dspace/config/controlled-vocabularies/cg-contributor-donor.xml create mode 100644 dspace/config/controlled-vocabularies/cg-coverage-subregion.xml create mode 100644 dspace/config/controlled-vocabularies/cg-creator-identifier.xml create mode 100644 dspace/config/controlled-vocabularies/cg-journal.xml create mode 100644 dspace/config/controlled-vocabularies/cg-species-breed.xml create mode 100644 dspace/config/controlled-vocabularies/dc-contributor-author.xml create mode 100644 dspace/config/controlled-vocabularies/dcterms-subject.xml diff --git a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml new file mode 100644 index 000000000000..78226ca3de78 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xmldiff --git a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml new file mode 100644 index 000000000000..25c960c6fb42 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-contributor-donor.xmldiff --git a/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml b/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml new file mode 100644 index 
000000000000..e1798618054f --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml @@ -0,0 +1,87 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml new file mode 100644 index 000000000000..48c7634d841e --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xmldiff --git a/dspace/config/controlled-vocabularies/cg-journal.xml b/dspace/config/controlled-vocabularies/cg-journal.xml new file mode 100644 index 000000000000..80b76d1fb46e --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-journal.xmldiff --git a/dspace/config/controlled-vocabularies/cg-species-breed.xml b/dspace/config/controlled-vocabularies/cg-species-breed.xml new file mode 100644 index 000000000000..1095562b8dcf --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-species-breed.xmldiff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml new file mode 100644 index 000000000000..647ed6dc357b --- /dev/null +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xmldiff --git a/dspace/config/controlled-vocabularies/dcterms-subject.xml b/dspace/config/controlled-vocabularies/dcterms-subject.xml new file mode 100644 index 000000000000..73fab2b5a98e --- /dev/null +++ b/dspace/config/controlled-vocabularies/dcterms-subject.xmlrom bcc4031fb6827d9268d885a4af0c98cb23bd9dbd Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 5 Aug 2022 21:11:05 +0300 Subject: [PATCH 005/119] dspace/config: import latest spider lists These are the latest spider agent lists from our DSpace 6.x branch. 
--- dspace/config/spiders/agents/ilri | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 dspace/config/spiders/agents/ilri diff --git a/dspace/config/spiders/agents/ilri b/dspace/config/spiders/agents/ilri new file mode 100644 index 000000000000..9b3ebae095eb --- /dev/null +++ b/dspace/config/spiders/agents/ilri @@ -0,0 +1,35 @@ +Delphi +centuryb\.o\.t9 +RI\/1\.0 +PostmanRuntime +node-fetch +Photon +StatusCake_Pagespeed_indev +node-superagent +cortex +FlipboardProxy +nettle +GARDIAN +randint +scalaj-http +scpitspi-rs +lua-resty-http +AHC +acebookexternalhit +Iframely +qbhttp +^got +^colly +article-parser +1science +Moreover\/\d +Nutch-\d +Exploratodo\/\d +Crowsnest\/\d +metha\/\d +FullStoryBot +SomeRandomText +RestSharp +MetaInspector +ubermetrics +insomnia From 08a01136cf27102469217bd0bd11e5cb3aea87a7 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 6 Sep 2022 16:12:23 +0300 Subject: [PATCH 006/119] dspace/config: adjust usage statistics Sync settings with DSpace 6. We do not want to log bot requests, and we want to match user agents case insensitively. --- dspace/config/modules/usage-statistics.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/modules/usage-statistics.cfg b/dspace/config/modules/usage-statistics.cfg index c77bb1ca78a3..199992617776 100644 --- a/dspace/config/modules/usage-statistics.cfg +++ b/dspace/config/modules/usage-statistics.cfg @@ -35,11 +35,11 @@ usage-statistics.authorization.admin.workflow=true # If true, event will be logged with the 'isBot' field set to true # (see query.filter.* for query filter options) # Default value is true. 
-#usage-statistics.logBots = true +usage-statistics.logBots = false # Enable/disable if a matching for a bot should be case sensitive # Setting this value to true will increase cpu usage, but bots will be found more accurately -#usage-statistics.bots.case-insensitive = false +usage-statistics.bots.case-insensitive = true # Set to true if the statistics core is sharded into a core per year, defaults to false # If you are sharding your statistics index each year by running "dspace stats-util -s", you should set this to "true" From 35708c2c6e45d78aa8e4ad0eb8a413d519ae4380 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 1 Nov 2022 21:07:03 +0300 Subject: [PATCH 007/119] dspace.cfg: override ImageMagick density Use a "2x" supersample by overriding the ImageMagick density from 72 to 144. This creates higher-quality PDF thumbnails. --- dspace/config/dspace.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 05615075d31a..46a9f3caed62 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -555,7 +555,7 @@ filter.org.dspace.app.mediafilter.PDFBoxThumbnail.inputFormats = Adobe PDF # bnails. Greatly increases quality of resulting thumbnails, at the expense of # slightly longer execution times and higher memory usage. Any integer over 72 # will help, but recommend 144 for a "2x" supersample. -# org.dspace.app.mediafilter.ImageMagickThumbnailFilter.density = 144 +org.dspace.app.mediafilter.ImageMagickThumbnailFilter.density = 144 #### Crosswalk and Packager Plugin Settings #### # Crosswalks are used to translate external metadata formats into DSpace's internal format (DIM) From e2808fd99c6367df1cf294701f015a915e95917d Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 8 Dec 2022 18:41:46 +0200 Subject: [PATCH 008/119] dspace.cfg: enable eperson.subscription.onlynew We only want to send subscription mails for new items, not modified ones. 
--- dspace/config/dspace.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 46a9f3caed62..f4e5fd9ccf67 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -901,6 +901,9 @@ org.dspace.app.batchitemimport.work.dir = ${dspace.dir}/imports # default = false, (disabled) #org.dspace.content.Collection.findAuthorizedPerformanceOptimize = true +# For backwards compatibility, the subscription emails by default include any modified items +# The following entry is enabled so that only new items are emailed +eperson.subscription.onlynew = true # Identifier providers. # Following are configuration values for the EZID DOI provider, with appropriate From 1a47dd7bdccbac50699c137a24512d59b980130c Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 11 Dec 2022 21:37:22 +0300 Subject: [PATCH 009/119] dspace/config: enable LDAP authentication --- dspace/config/modules/authentication.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/modules/authentication.cfg b/dspace/config/modules/authentication.cfg index 568f871e3cd7..9868b42d6602 100644 --- a/dspace/config/modules/authentication.cfg +++ b/dspace/config/modules/authentication.cfg @@ -44,7 +44,7 @@ #plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.IPAuthentication # LDAP authentication/authorization. See authentication-ldap.cfg for default configuration. -#plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.LDAPAuthentication +plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.LDAPAuthentication # Shibboleth authentication/authorization. See authentication-shibboleth.cfg for default configuration. 
#plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.ShibAuthentication From a3829bf6e93e3b6bc8c096f264f7e07c0a46d656 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 27 Feb 2023 21:33:17 +0300 Subject: [PATCH 010/119] Add IDs to controlled vocabularies These behave differently in DSpace 7 than in DSpace 6 when the IDs are missing (you get infinitely repeating values in the submission form dropdown). --- .../cg-contributor-affiliation.xml | 2000 ++++---- .../cg-contributor-donor.xml | 1566 +++---- .../cg-coverage-subregion.xml | 162 +- .../cg-creator-identifier.xml | 2806 ++++++------ .../controlled-vocabularies/cg-journal.xml | 4046 ++++++++--------- .../cg-species-breed.xml | 1760 +++---- .../dc-contributor-author.xml | 3002 ++++++------ 7 files changed, 7671 insertions(+), 7671 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml index 78226ca3de78..809ba58df1ae 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xmldiff --git a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml index 25c960c6fb42..05ee10325302 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-donor.xmldiff --git a/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml b/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml index e1798618054f..164d8bd35982 100644 --- a/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml +++ b/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml @@ -2,86 +2,86 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index 48c7634d841e..b081b31883d1 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xmldiff --git a/dspace/config/controlled-vocabularies/cg-journal.xml b/dspace/config/controlled-vocabularies/cg-journal.xml index 80b76d1fb46e..03403765acc1 100644 --- a/dspace/config/controlled-vocabularies/cg-journal.xml +++ b/dspace/config/controlled-vocabularies/cg-journal.xmldiff --git a/dspace/config/controlled-vocabularies/cg-species-breed.xml b/dspace/config/controlled-vocabularies/cg-species-breed.xml index 1095562b8dcf..d1844dd7c727 100644 --- a/dspace/config/controlled-vocabularies/cg-species-breed.xml +++ b/dspace/config/controlled-vocabularies/cg-species-breed.xmldiff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index 647ed6dc357b..f5ea865e4ca6 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xmlrom 92948e35b6de28be13117205b948cce97e725244 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 28 Feb 2023 19:50:02 +0300 Subject: [PATCH 011/119] Add IDs to dcterms-subject.xml --- .../dcterms-subject.xml | 2818 ++++++++--------- 1 file changed, 1409 insertions(+), 1409 deletions(-) diff --git a/dspace/config/controlled-vocabularies/dcterms-subject.xml b/dspace/config/controlled-vocabularies/dcterms-subject.xml index 73fab2b5a98e..03fb2120a81e 100644 --- a/dspace/config/controlled-vocabularies/dcterms-subject.xml +++ b/dspace/config/controlled-vocabularies/dcterms-subject.xmlrom 
a34b6e9558695af1aace5eb369dd55560040377b Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 2 Mar 2023 15:59:09 +0300 Subject: [PATCH 012/119] discovery.xml: add some CGSpace filters --- dspace/config/spring/api/discovery.xml | 80 ++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index fb25f11598fa..5e5e69f5ff09 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -165,8 +165,10 @@ - - + + + + @@ -178,15 +180,10 @@ - - - - - - - - - + + + + @@ -2848,6 +2845,67 @@ + + + + + + cg.coverage.country + + + + + + + + + + + + + + + cg.coverage.region + + + + + + + + + + + + + + + dcterms.accessRights + + + + + + + + + + + + + + + dcterms.license + + + + + + + + + + From 2950ca618a49ee5ce8bdec802455a243d82f0c52 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 2 Mar 2023 15:59:27 +0300 Subject: [PATCH 013/119] discovery.xml: update default filters Our subjects are in dcterms.subject and we don't use hierarchical subjects. Also, change the date issued field. --- dspace/config/spring/api/discovery.xml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index 5e5e69f5ff09..929c3c19d23c 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -2317,17 +2317,16 @@ - + - dc.subject.* + dcterms.subject - @@ -2335,7 +2334,7 @@ - dc.date.issued + dcterms.issued From fa9d51786306e3e976fd9e7f929b7a9313dd89f6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 2 Mar 2023 16:51:12 +0300 Subject: [PATCH 014/119] discovery.xml: more filter adjustments Use dcterms.type for Type and add a few more filters from CGSpace. 
--- dspace/config/spring/api/discovery.xml | 40 ++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index 929c3c19d23c..e1183daf68c9 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -163,8 +163,10 @@ - + + + @@ -178,8 +180,10 @@ - + + + @@ -2379,7 +2383,7 @@ - dc.type + dcterms.type @@ -2905,6 +2909,36 @@ + + + + + cg.contributor.initiative + + + + + + + + + + + + + + + cg.contributor.affiliation + + + + + + + + + + From a6994662b6e5a935c30ff6b58d78f39d45fdd82e Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 2 Mar 2023 20:34:06 +0300 Subject: [PATCH 015/119] discovery.xml: add more facets --- dspace/config/spring/api/discovery.xml | 73 +++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index e1183daf68c9..c1fdc1351c0c 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -166,7 +166,12 @@ - + + + + + + @@ -183,6 +188,12 @@ + + + + + + @@ -2939,6 +2950,66 @@ + + + + + cg.contributor.crp + + + + + + + + + + + + cg.contributor.donor + + + + + + + + + + + + cg.subject.sdg + + + + + + + + + + + + cg.subject.impactArea + + + + + + + + + + + + cg.subject.actionArea + + + + + + + From 2b6cc7b8789048fc39d76d80568f2157c1311c43 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 3 Mar 2023 12:50:32 +0300 Subject: [PATCH 016/119] dspace/config/spiders: update agents Update from latest COUNTER-Robots list, removing from the local ILRI overrides where needed. 
--- dspace/config/spiders/agents/ilri | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/dspace/config/spiders/agents/ilri b/dspace/config/spiders/agents/ilri index 9b3ebae095eb..4f1da09aa0ef 100644 --- a/dspace/config/spiders/agents/ilri +++ b/dspace/config/spiders/agents/ilri @@ -1,5 +1,4 @@ Delphi -centuryb\.o\.t9 RI\/1\.0 PostmanRuntime node-fetch @@ -8,7 +7,6 @@ StatusCake_Pagespeed_indev node-superagent cortex FlipboardProxy -nettle GARDIAN randint scalaj-http @@ -29,7 +27,16 @@ Crowsnest\/\d metha\/\d FullStoryBot SomeRandomText -RestSharp -MetaInspector ubermetrics -insomnia +curb +bitdiscovery +omgili +Vizzit +Java\/17-ea +AdobeUxTechC4-Async +ZaloPC-win32-24v473 +nbertaupete95 +Scoop\.it +WebAPIClient +RStudio +^MEL From 2d89cec331183478f4bda308ee9f36cf65f08478 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 10 Mar 2023 14:08:10 +0300 Subject: [PATCH 017/119] dspace.cfg: increase thumbnail size to 500px It has been ten years since we last increased the thumbnail size on CGSpace. 300px has served us well, but higher-density displays and new uses for thumbnails call for larger sizes. For reference, I checked 300 - 600px to see the implications on file size for one portrait layout PDF: - 212x300px: 31 kilobytes <--- current - 283x400px: 52 kilobytes - 354x500px: 76 kilobytes - 424x600px: 106 kilobytes I think 500px is a good compromise. --- dspace/config/dspace.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index f4e5fd9ccf67..6fea2e3dbeb2 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1031,8 +1031,8 @@ cc.license.locale = en # Maximum width and height (in pixels) of generated thumbnails # NOTE: In the UI's base theme, `--ds-thumbnail-max-width` defaults to 175px. # So, if you set 'thumbnail.maxwidth' >175, you may wish to modify that UI style variable as well. 
-thumbnail.maxwidth = 300 -thumbnail.maxheight = 300 +thumbnail.maxwidth = 500 +thumbnail.maxheight = 500 # Blur before scaling. A little blur before scaling does wonders for keeping # more in check. (Only used by JPEGFilter) From 0e6830b4d588ee9647073533a9134b454607b229 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 31 May 2023 08:52:48 +0300 Subject: [PATCH 018/119] dspace: use CG Core fields for CrossRef import --- .../config/spring/api/crossref-integration.xml | 18 +++++++++--------- dspace/config/spring/api/external-services.xml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dspace/config/spring/api/crossref-integration.xml b/dspace/config/spring/api/crossref-integration.xml index d1e416d2b0c6..290867068b4d 100644 --- a/dspace/config/spring/api/crossref-integration.xml +++ b/dspace/config/spring/api/crossref-integration.xml @@ -38,7 +38,7 @@ - + @@ -46,7 +46,7 @@ - + @@ -54,7 +54,7 @@ - + @@ -76,7 +76,7 @@ - + @@ -84,7 +84,7 @@ - + @@ -111,7 +111,7 @@ - + @@ -119,7 +119,7 @@ - + @@ -127,7 +127,7 @@ - + @@ -135,7 +135,7 @@ - + diff --git a/dspace/config/spring/api/external-services.xml b/dspace/config/spring/api/external-services.xml index 6d7d50c39f1b..12d852f73b92 100644 --- a/dspace/config/spring/api/external-services.xml +++ b/dspace/config/spring/api/external-services.xml @@ -107,7 +107,7 @@ - + Publication From 1ef63be463687c95e6531e1534ed39aade0b57da Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 31 May 2023 08:53:19 +0300 Subject: [PATCH 019/119] dspace: use cg.identifier.doi for Datacite import We don't use Datacite yet, but this is easy to change now so it can possibly work in the future. 
--- dspace/config/spring/api/datacite-integration.xml | 2 +- dspace/config/spring/api/external-services.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/spring/api/datacite-integration.xml b/dspace/config/spring/api/datacite-integration.xml index 236ec0a3bda9..911d5bec3a29 100644 --- a/dspace/config/spring/api/datacite-integration.xml +++ b/dspace/config/spring/api/datacite-integration.xml @@ -56,7 +56,7 @@ - + diff --git a/dspace/config/spring/api/external-services.xml b/dspace/config/spring/api/external-services.xml index 12d852f73b92..f3391ce35bda 100644 --- a/dspace/config/spring/api/external-services.xml +++ b/dspace/config/spring/api/external-services.xml @@ -215,7 +215,7 @@ - + Publication From fab1133770119df3eb800d72f4f61564d8bc40d3 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 31 May 2023 08:57:10 +0300 Subject: [PATCH 020/119] dspace: use CG Core fields for PubMed import We are not using this yet, but it will surely come in handy. --- .../config/spring/api/pubmed-integration.xml | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/dspace/config/spring/api/pubmed-integration.xml b/dspace/config/spring/api/pubmed-integration.xml index adec4456ea03..c2ba41b5a867 100644 --- a/dspace/config/spring/api/pubmed-integration.xml +++ b/dspace/config/spring/api/pubmed-integration.xml @@ -18,17 +18,17 @@ what metadatafield is generated. - + - - - + + + - + @@ -44,7 +44,7 @@ - + @@ -58,7 +58,7 @@ - + @@ -87,30 +87,30 @@ - + - + - + - + - + @@ -119,8 +119,8 @@ - - + + @@ -128,17 +128,17 @@ - - + + - - + + - - + + From 29907dd79568673ada6a5c56947226cfcbbe4cab Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 13 Jun 2023 14:59:17 +0300 Subject: [PATCH 021/119] Add dspace/config/default.license Copied directly from CGSpace's DSpace 6 repository. 
--- dspace/config/default.license | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/dspace/config/default.license b/dspace/config/default.license index 390e9786688d..57f743aca7cc 100644 --- a/dspace/config/default.license +++ b/dspace/config/default.license @@ -1,18 +1,13 @@ -NOTE: PLACE YOUR OWN LICENSE HERE -This sample license is provided for informational purposes only. +By signing and submitting this license, you (the author(s) or copyright owner) grant to the curators of CGSpace (see https://cgspace.cgiar.org/page/about) the non-exclusive right to reproduce, translate (as defined below), and/or distribute your submission (including the abstract) worldwide in print and electronic format and in any medium, including but not limited to audio or video. -NON-EXCLUSIVE DISTRIBUTION LICENSE +You agree that ILRI (on behalf of the CGSpace group) may, without changing the content, translate the submission to any medium or format for the purpose of preservation. -By signing and submitting this license, you (the author(s) or copyright owner) grants to DSpace University (DSU) the non-exclusive right to reproduce, translate (as defined below), and/or distribute your submission (including the abstract) worldwide in print and electronic format and in any medium, including but not limited to audio or video. +You also agree that ILRI (on behalf of the CGSpace group) may keep more than one copy of this submission for purposes of security, back-up and preservation. -You agree that DSU may, without changing the content, translate the submission to any medium or format for the purpose of preservation. +You represent that the submission is your original work, and that you have the right to grant these rights. You also represent that your submission does not, to the best of your knowledge, infringe upon anyone's copyright. -You also agree that DSU may keep more than one copy of this submission for purposes of security, back-up and preservation. 
+If the submission contains material for which you do not hold copyright, you represent that you have obtained the unrestricted permission of the copyright owner to grant ILRI (on behalf of the CGSpace group) the rights required by this license, and that such third-party owned material is clearly identified and acknowledged within the text or content of the submission. -You represent that the submission is your original work, and that you have the right to grant the rights contained in this license. You also represent that your submission does not, to the best of your knowledge, infringe upon anyone's copyright. +IF THE SUBMISSION IS BASED UPON WORK THAT HAS BEEN SPONSORED OR SUPPORTED BY AN AGENCY OR ORGANIZATION OTHER THAN ILRI OR THE CGSPACE GROUP, YOU REPRESENT THAT YOU HAVE FULFILLED ANY RIGHT OF REVIEW OR OTHER OBLIGATIONS REQUIRED BY SUCH CONTRACT OR AGREEMENT. -If the submission contains material for which you do not hold copyright, you represent that you have obtained the unrestricted permission of the copyright owner to grant DSU the rights required by this license, and that such third-party owned material is clearly identified and acknowledged within the text or content of the submission. - -IF THE SUBMISSION IS BASED UPON WORK THAT HAS BEEN SPONSORED OR SUPPORTED BY AN AGENCY OR ORGANIZATION OTHER THAN DSU, YOU REPRESENT THAT YOU HAVE FULFILLED ANY RIGHT OF REVIEW OR OTHER OBLIGATIONS REQUIRED BY SUCH CONTRACT OR AGREEMENT. - -DSU will clearly identify your name(s) as the author(s) or owner(s) of the submission, and will not make any alteration, other than as allowed by this license, to your submission. +ILRI (on behalf of the CGSpace group) will clearly identify your name(s) as the author(s) or owner(s) of the submission, and will not make any alteration, other than as allowed by this license, to your submission. 
From b1a9efddb5784a667462c7f3ecb160a4f4f4e05f Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 14 Jun 2023 08:47:13 +0300 Subject: [PATCH 022/119] dspace/config: import submission forms again Hopefully easier to maintain during future rebases now. --- dspace/config/item-submission.xml | 6 + dspace/config/submission-forms.xml | 4563 +++++++++++++++++++++++++++- 2 files changed, 4425 insertions(+), 144 deletions(-) diff --git a/dspace/config/item-submission.xml b/dspace/config/item-submission.xml index 1060a3303119..f5fcef7266d9 100644 --- a/dspace/config/item-submission.xml +++ b/dspace/config/item-submission.xml @@ -109,6 +109,11 @@ org.dspace.app.rest.submit.step.DescribeStep submission-form + + submit.progressbar.describe.stepthree + org.dspace.app.rest.submit.step.DescribeStep + submission-form + submit.progressbar.describe.stepone org.dspace.app.rest.submit.step.DescribeStep @@ -261,6 +266,7 @@ + diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 39a4778356c0..f531bbedd82c 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -51,132 +51,393 @@
+ + + dc + title + + false + + onebox + Enter the full official title of the item (including any subtitles, etc). + You must enter a main title for this item. + + + + + dcterms + alternative + + true + + onebox + Enter any alternative titles for the item (other languages, etc). + + + + + + dcterms + type + + false + + dropdown + Select the type of content of the item. + You must select item type. + + +
+ +
dc contributor author true - + + dc-contributor-author onebox - Enter the author's name (Family name, Given names). + Enter the names of individual or corporate authors of this item (enter each individually). + Please enter the author(s). + + + + + cg + creator + identifier + true + + cg-creator-identifier + onebox + Enter ORCID identifiers for CGIAR authors, one per author. If an identifier is missing from the list, enter a new one in the exact same format (Name: 0000-0002-1735-7458). Use the exact name style the author uses at https://orcid.org - dc - title + cg + contributor + affiliation + true + + cg-contributor-affiliation + onebox + Enter the full name of the institutions that the author(s) work for (enter each individually). + + + + cg + contributor + donor + true + + onebox + cg-contributor-donor + Enter full institution name(s) that sponsored the item. Click below to see a pre-populated list, or add one manually. CGIAR Research Program funding is identified in the field above. + + + + + + cg + authorship + types + true + + dropdown + Characterize the entire authorship based on author affiliations. + + + + + + cg + journal false - + + cg-journal onebox - Enter the main title of the item. - You must enter a main title for this item. + Enter the full journal title. + Journal Article,Data Paper + - dc - title - alternative - true - + cg + volume + + false + + onebox - If the item has any alternative titles, please enter them here. + Enter the journal volume. For example, if published in PLoS ONE 16(1), the volume is: 16 + Journal Article,Data Paper + + + + cg + issue + + false + + onebox + + Enter the journal issue. For example, if published in PLoS ONE 16(1), the issue number is: 1 + Journal Article,Data Paper - dc - date - issued + cg + edition + false - - + + + onebox + Enter the edition of the item (3rd, 17th, Revised). + + + + dcterms + extent + + false + + + onebox + Enter the page range for an article or chapter (p. 
10-17); or the total pages for a report (120 p.). + + + + + + dcterms + issued + + false + date - Please give the date of previous publication or public distribution. - You can leave out the day and/or month if they aren't applicable. - + Enter the date when the item was officially published or made public. For journal articles, this is the date the article was included in a volume/issue, if known. If the issue date is not known, please use the online first date here. You must enter at least the year. - dc + dcterms + available + + false + + date + Enter the date when the item was available online. Use this to indicate a journal article was online first, and use the publication date for the issue date. + Journal Article,Data Paper + + + + + + dcterms publisher false - onebox - Enter the name of the publisher of the previously issued instance of this item. + Enter the full name of the publisher. - dc - identifier - citation + dcterms + bibliographicCitation + false - + onebox - Enter the standard citation for the previously issued instance of this item. - + Enter the standard citation. + Standard citation is required. - dc - relation - ispartofseries + dcterms + isPartOf + + true + + + onebox + Enter the full official series/report name, for example CCAFS Info Note or ILRI Research Report. + + + + cg + number + true - series - Enter the series and number assigned to this item by your community. + + onebox + Enter the series/report number. - dc - identifier + dcterms + abstract + + false + + textarea + Enter the abstract of the item. + + + + + + dcterms + accessRights + + false + + dropdown + Indicate the accessibility of the item. If the item is free to read this should be "Open Access", even if it is copyrighted. + Access rights is required. + + + dcterms + license + false + + dropdown + Indicate the usage rights of the item. For most items this will be a Creative Commons license. Look on the publisher's page or in the PDF itself if available. 
Choose "Other" if the item does not have a license or none is specified. + Usage rights is required. + + + + + cg + isijournal + + false + + Indicate if the item was published in an ISI journal. + dropdown + + Journal Article,Data Paper + + + + + cg + howPublished + + false + + Indicate whether the item was formally published or not. In general, any output other than journal articles and books is grey literature. + dropdown + + + + cg + reviewStatus + + false + + Indicate whether the item has undergone internal or peer review. + dropdown + + + + + + dcterms + audience - true - - qualdrop_value - If the item has any identification numbers or codes associated with - it, please enter the types and the actual numbers or codes. - + + Indicate the main audience for whom the item is produced. + dropdown - dc - type + cg + isbn + false + + + onebox + Enter the ISBN for the item, for example: 978-3-16-148410-0 + + + + cg + issn + + false + + + onebox + Enter the ISSN for the serial publication where this item appears, for example: 2049-3630 + + + + + + cg + identifier + url true - - dropdown - Select the type of content of the item. - + + + onebox + Enter a full URL for the item, for example a website, blog post, Google Books URL, etc (do not use this for links to datasets, use data URL instead). + + + + cg + identifier + doi + true + + + onebox + Enter the full address in format: https://doi.org/10.1038/s41598-019-43406-0 + + + + cg + identifier + dataurl + true + + + onebox + A URL for any associated data file(s), in a repository for example. - dc + dcterms language - iso - false + + true dropdown Select the language of the main content of the item. If the language does not appear in the list, please select 'Other'. If the content does not really have a language (for example, if it - is a dataset or an image) please select 'N/A'. + is a dataset or an image) please leave this blank. 
@@ -185,57 +446,288 @@ - + - dc + cg + contributor + initiative + true + + dropdown + Select any CGIAR Initiative(s) associated with this item. Use this to show that an Initiative funded this item. + + + + + + cg + contributor + crp + true + + dropdown + Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. + + + + + + cg + subject + sdg + true + + dropdown + Select any UN Sustainable Development Goals associated with this item. + + + + + + cg + subject + impactArea + true + + dropdown + Select any CGIAR Impact Areas associated with this item. + + + + cg + subject + actionArea + true + + dropdown + Select any CGIAR Action Areas associated with this item. + + + + + + cg + coverage + country + true + + dropdown + Select a country or countries within the scope of the item. + + + + + + cg + coverage + region + true + + dropdown + Select a region or regions within the scope of the item. + + + + cg + coverage + subregion + true + + onebox + cg-coverage-subregion + Enter the subregion. Normally provinces, states, regions, etc WITHIN a country according to ISO 3166-2 subdivisions. + + + + + + cg + river + basin + true + + dropdown + Select the focus basin(s). + + + + + + dcterms subject - true - + tag - Enter appropriate subject keywords or phrases. + Enter AGROVOC subjects in lower case. - srsc + dcterms-subject - dc - description - abstract - false - - textarea - Enter the abstract of the item. + cg + identifier + iitatheme + true + + dropdown + + + + + cg + subject + iita + true + + dropdown + - dc - description - sponsorship + cg + identifier + iwmilibrary false - - textarea - Enter the names of any sponsors and/or funding codes in the box. 
+ + onebox + IWMI Library internal reference (eg H049940) - dc + cg + subject + alliancebiovciat + true + + dropdown + + + + + + + cg + subject + cip + true + + dropdown + + + + + + + cg + subject + ilri + true + + dropdown + + + + + + + dcterms description false - + textarea Enter any other description or comments in this box. + + + cg + link + video + false + + + onebox + Enter related video link (eg a video about this item). + + + + cg + link + audio + false + + + onebox + Enter related audio link (eg an audio about this item). + + + + cg + link + photo + false + + + onebox + Enter related photo link (eg a photo or image associated with this item). + + + + + + dcterms + relation + + false + + onebox + Enter related reference link (normally a URL to another item). + + + + + + cg + link + citation + false + + onebox + Enter related reference citation (normally a full citation, not a URL). + + + + + + cg + species + + true + + onebox + Enter scientific name of organism, plant, animal, etc if it is a main focus of the item, for example: Lablab purpureus, Bos Taurus, Oryza sativa, Theileria parva + + + + cg + species + breed + true + + onebox + cg-species-breed + + + @@ -1394,149 +1886,3932 @@ - Animation - Animation + Abstract + Abstract + + + Audio + Audio - Article - Article + Blog Post + Blog Post Book Book - Book chapter - Book chapter + Book Chapter + Book Chapter + + + Brief + Brief + + + Brochure + Brochure + + + Case Study + Case Study + + + Conference Paper + Conference Paper + + + Conference Proceedings + Conference Proceedings + + + Data Paper + Data Paper Dataset Dataset - Learning Object - Learning Object + Equation + Equation + + + Extension Material + Extension Material Image Image - Image, 3-D - Image, 3-D + Infographic + Infographic - Map - Map + Internal Document + Internal Document - Musical Score - Musical Score + Journal Article + Journal Article - Plan or blueprint - Plan or blueprint + Journal Item + Journal Item - Preprint - Preprint + Logo + 
Logo - Presentation - Presentation - - - Recording, acoustical - Recording, acoustical + Manual + Manual - Recording, musical - Recording, musical + Manuscript-unpublished + Manuscript-unpublished - Recording, oral - Recording, oral + Map + Map - Software - Software + News Item + News Item - Technical Report - Technical Report + Newsletter + Newsletter - Thesis - Thesis + Opinion Piece + Opinion Piece - Video - Video + Photo Report + Photo Report - Working Paper - Working Paper + Poster + Poster - Other - Other + Presentation + Presentation - - - - - N/A - + Press Item + Press Item - English (United States) - en_US + Report + Report - English - en + Software + Software - Spanish - es + Source Code + Source Code - German - de + Template + Template - French - fr + Thesis + Thesis - Italian - it + Training Material + Training Material - Japanese - ja + Video + Video - Chinese - zh + Website + Website - Portuguese - pt + Working Paper + Working Paper - Turkish - tr + Wiki + Wiki - (Other) - other + Other + Other + + + + Afar + aa + + + Abkhazian + ab + + + Afrikaans + af + + + Akan + ak + + + Amharic + am + + + Arabic + ar + + + Aragonese + an + + + Assamese + as + + + Avaric + av + + + Avestan + ae + + + Aymara + ay + + + Azerbaijani + az + + + Bashkir + ba + + + Bambara + bm + + + Belarusian + be + + + Bengali + bn + + + Bislama + bi + + + Tibetan + bo + + + Bosnian + bs + + + Breton + br + + + Bulgarian + bg + + + Catalan + ca + + + Czech + cs + + + Chamorro + ch + + + Chechen + ce + + + Church Slavic + cu + + + Chuvash + cv + + + Cornish + kw + + + Corsican + co + + + Cree + cr + + + Welsh + cy + + + Danish + da + + + German + de + + + Dhivehi + dv + + + Dzongkha + dz + + + Modern Greek (1453-) + el + + + English + en + + + Esperanto + eo + + + Estonian + et + + + Basque + eu + + + Ewe + ee + + + Faroese + fo + + + Persian + fa + + + Fijian + fj + + + Finnish + fi + + + French + fr + + + Western Frisian + fy + + + Fulah + ff + + + Scottish Gaelic + gd + + + Irish + 
ga + + + Galician + gl + + + Manx + gv + + + Guarani + gn + + + Gujarati + gu + + + Haitian + ht + + + Hausa + ha + + + Serbo-Croatian + sh + + + Hebrew + he + + + Herero + hz + + + Hindi + hi + + + Hiri Motu + ho + + + Croatian + hr + + + Hungarian + hu + + + Armenian + hy + + + Igbo + ig + + + Ido + io + + + Sichuan Yi + ii + + + Inuktitut + iu + + + Interlingue + ie + + + Interlingua (International Auxiliary Language Association) + ia + + + Indonesian + id + + + Inupiaq + ik + + + Icelandic + is + + + Italian + it + + + Javanese + jv + + + Japanese + ja + + + Kalaallisut + kl + + + Kannada + kn + + + Kashmiri + ks + + + Georgian + ka + + + Kanuri + kr + + + Kazakh + kk + + + Central Khmer + km + + + Kikuyu + ki + + + Kinyarwanda + rw + + + Kirghiz + ky + + + Komi + kv + + + Kongo + kg + + + Korean + ko + + + Kuanyama + kj + + + Kurdish + ku + + + Lao + lo + + + Latin + la + + + Latvian + lv + + + Limburgan + li + + + Lingala + ln + + + Lithuanian + lt + + + Luxembourgish + lb + + + Luba-Katanga + lu + + + Ganda + lg + + + Marshallese + mh + + + Malayalam + ml + + + Marathi + mr + + + Macedonian + mk + + + Malagasy + mg + + + Maltese + mt + + + Mongolian + mn + + + Maori + mi + + + Malay (macrolanguage) + ms + + + Burmese + my + + + Nauru + na + + + Navajo + nv + + + South Ndebele + nr + + + North Ndebele + nd + + + Ndonga + ng + + + Nepali (macrolanguage) + ne + + + Dutch + nl + + + Norwegian Nynorsk + nn + + + Norwegian Bokmål + nb + + + Norwegian + no + + + Nyanja + ny + + + Occitan (post 1500) + oc + + + Ojibwa + oj + + + Oriya (macrolanguage) + or + + + Oromo + om + + + Ossetian + os + + + Panjabi + pa + + + Pali + pi + + + Polish + pl + + + Portuguese + pt + + + Pushto + ps + + + Quechua + qu + + + Romansh + rm + + + Romanian + ro + + + Rundi + rn + + + Russian + ru + + + Sango + sg + + + Sanskrit + sa + + + Sinhala + si + + + Slovak + sk + + + Slovenian + sl + + + Northern Sami + se + + + Samoan + sm + + + Shona + sn + + + Sindhi + sd + + + Somali + so + + 
+ Southern Sotho + st + + + Spanish + es + + + Albanian + sq + + + Sardinian + sc + + + Serbian + sr + + + Swati + ss + + + Sundanese + su + + + Swahili (macrolanguage) + sw + + + Swedish + sv + + + Tahitian + ty + + + Tamil + ta + + + Tatar + tt + + + Telugu + te + + + Tajik + tg + + + Tagalog + tl + + + Thai + th + + + Tigrinya + ti + + + Tonga (Tonga Islands) + to + + + Tswana + tn + + + Tsonga + ts + + + Turkmen + tk + + + Turkish + tr + + + Twi + tw + + + Uighur + ug + + + Ukrainian + uk + + + Urdu + ur + + + Uzbek + uz + + + Venda + ve + + + Vietnamese + vi + + + Volapük + vo + + + Walloon + wa + + + Wolof + wo + + + Xhosa + xh + + + Yiddish + yi + + + Yoruba + yo + + + Zhuang + za + + + Chinese + zh + + + Zulu + zu + + + (Other) + other + + + + + Accelerated Breeding + Accelerated Breeding + + + AgriLAC Resiliente + AgriLAC Resiliente + + + Agroecology + Agroecology + + + Aquatic Foods + Aquatic Foods + + + Asian Mega-Deltas + Asian Mega-Deltas + + + Breeding Resources + Breeding Resources + + + Climate Resilience + Climate Resilience + + + Digital Innovation + Digital Innovation + + + Diversification in East and Southern Africa + Diversification in East and Southern Africa + + + Excellence in Agronomy + Excellence in Agronomy + + + Foresight + Foresight + + + Fragility, Conflict, and Migration + Fragility, Conflict, and Migration + + + Fragility to Resilience in Central and West Asia and North Africa + Fragility to Resilience in Central and West Asia and North Africa + + + Fruits and Vegetables + Fruits and Vegetables + + + Gender Equality + Gender Equality + + + Genebanks + Genebanks + + + Livestock and Climate + Livestock and Climate + + + Low-Emission Food Systems + Low-Emission Food Systems + + + Market Intelligence + Market Intelligence + + + Mixed Farming Systems + Mixed Farming Systems + + + NEXUS Gains + NEXUS Gains + + + National Policies and Strategies + National Policies and Strategies + + + Nature-Positive Solutions + Nature-Positive Solutions + 
+ + One Health + One Health + + + Plant Health + Plant Health + + + Resilient Cities + Resilient Cities + + + Rethinking Food Markets + Rethinking Food Markets + + + Seed Equal + Seed Equal + + + Sustainable Animal Productivity + Sustainable Animal Productivity + + + Sustainable Healthy Diets + Sustainable Healthy Diets + + + Transforming Agrifood Systems in South Asia + Transforming Agrifood Systems in South Asia + + + West and Central African Food Systems Transformation + West and Central African Food Systems Transformation + + + + + Agriculture for Nutrition and Health + Agriculture for Nutrition and Health + + + Big Data + Big Data + + + Climate Change, Agriculture and Food Security + Climate Change, Agriculture and Food Security + + + Excellence in Breeding + Excellence in Breeding + + + Fish + Fish + + + Forests, Trees and Agroforestry + Forests, Trees and Agroforestry + + + Gender + Gender + + + Genebanks + Genebanks + + + Grain Legumes and Dryland Cereals + Grain Legumes and Dryland Cereals + + + Livestock + Livestock + + + Maize + Maize + + + Policies, Institutions, and Markets + Policies, Institutions, and Markets + + + Rice + Rice + + + Roots, Tubers and Bananas + Roots, Tubers and Bananas + + + Water, Land and Ecosystems + Water, Land and Ecosystems + + + Wheat + Wheat + + + Aquatic Agricultural Systems + Aquatic Agricultural Systems + + + Dryland Cereals + Dryland Cereals + + + Dryland Systems + Dryland Systems + + + Grain Legumes + Grain Legumes + + + Integrated Systems for the Humid Tropics + Integrated Systems for the Humid Tropics + + + Livestock and Fish + Livestock and Fish + + + + + SDG 1 - No poverty + SDG 1 - No poverty + + + SDG 2 - Zero hunger + SDG 2 - Zero hunger + + + SDG 3 - Good health and well-being + SDG 3 - Good health and well-being + + + SDG 4 - Quality education + SDG 4 - Quality education + + + SDG 5 - Gender equality + SDG 5 - Gender equality + + + SDG 6 - Clean water and sanitation + SDG 6 - Clean water and sanitation + + + SDG 
7 - Affordable and clean energy + SDG 7 - Affordable and clean energy + + + SDG 8 - Decent work and economic growth + SDG 8 - Decent work and economic growth + + + SDG 9 - Industry, innovation and infrastructure + SDG 9 - Industry, innovation and infrastructure + + + SDG 10 - Reduced inequalities + SDG 10 - Reduced inequalities + + + SDG 11 - Sustainable cities and communities + SDG 11 - Sustainable cities and communities + + + SDG 12 - Responsible consumption and production + SDG 12 - Responsible consumption and production + + + SDG 13 - Climate action + SDG 13 - Climate action + + + SDG 14 - Life below water + SDG 14 - Life below water + + + SDG 15 - Life on land + SDG 15 - Life on land + + + SDG 16 - Peace, justice and strong institutions + SDG 16 - Peace, justice and strong institutions + + + SDG 17 - Partnerships for the goals + SDG 17 - Partnerships for the goals + + + + + Climate adaptation and mitigation + Climate adaptation and mitigation + + + Environmental health and biodiversity + Environmental health and biodiversity + + + Gender equality, youth and social inclusion + Gender equality, youth and social inclusion + + + Nutrition, health and food security + Nutrition, health and food security + + + Poverty reduction, livelihoods and jobs + Poverty reduction, livelihoods and jobs + + + + + Genetic Innovation + Genetic Innovation + + + Resilient Agrifood Systems + Resilient Agrifood Systems + + + Systems Transformation + Systems Transformation + + + + + ACP + ACP + + + Africa + Africa + + + Americas + Americas + + + Asia + Asia + + + Australia and New Zealand + Australia and New Zealand + + + Caribbean + Caribbean + + + Central America + Central America + + + Central Asia + Central Asia + + + Channel Islands + Channel Islands + + + Eastern Africa + Eastern Africa + + + Eastern Asia + Eastern Asia + + + Eastern Europe + Eastern Europe + + + Europe + Europe + + + Latin America + Latin America + + + Latin America and the Caribbean + Latin America and the 
Caribbean + + + Melanesia + Melanesia + + + Micronesia + Micronesia + + + Middle Africa + Middle Africa + + + Middle East + Middle East + + + Northern Africa + Northern Africa + + + Northern America + Northern America + + + Northern Europe + Northern Europe + + + Oceania + Oceania + + + Polynesia + Polynesia + + + Sahel + Sahel + + + South America + South America + + + South-eastern Asia + South-eastern Asia + + + Southern Africa + Southern Africa + + + Southern Asia + Southern Asia + + + Southern Europe + Southern Europe + + + Sub-Saharan Africa + Sub-Saharan Africa + + + West and Central Africa + West and Central Africa + + + Western Africa + Western Africa + + + Western Asia + Western Asia + + + Western Europe + Western Europe + + + + + Alberta + Alberta + + + An Giang + An Giang + + + Andhra Pradesh + Andhra Pradesh + + + Ashanti + Ashanti + + + Assam + Assam + + + Badulla + Badulla + + + Bali + Bali + + + Balochistan + Balochistan + + + Bamako + Bamako + + + Bihar + Bihar + + + British Columbia + British Columbia + + + Cajamarca + Cajamarca + + + California + California + + + Cauca + Cauca + + + Chhattisgarh + Chhattisgarh + + + Dakar + Dakar + + + Delaware + Delaware + + + Florida + Florida + + + Gampaha + Gampaha + + + Gandaki + Gandaki + + + Guanajuato + Guanajuato + + + Gujarat + Gujarat + + + Haryana + Haryana + + + Helmand + Helmand + + + Himachal Pradesh + Himachal Pradesh + + + Houaphan + Houaphan + + + Iowa + Iowa + + + Iringa + Iringa + + + Jambi + Jambi + + + Jharkhand + Jharkhand + + + Kaabong + Kaabong + + + kalimantan + kalimantan + + + Kampala + Kampala + + + Kandal + Kandal + + + Karnali + Karnali + + + Karnataka + Karnataka + + + Kasungu + Kasungu + + + Kerala + Kerala + + + Khon Kaen + Khon Kaen + + + Kunduz + Kunduz + + + Kwazulu-Natal + Kwazulu-Natal + + + Lima + Lima + + + Limpopo + Limpopo + + + Madhya Pradesh + Madhya Pradesh + + + Maharashtra + Maharashtra + + + Maputo + Maputo + + + Masvingo + Masvingo + + + Meghalaya + Meghalaya + + + 
Mendoza + Mendoza + + + Missouri + Missouri + + + Nabeul + Nabeul + + + Nagaland + Nagaland + + + Nebraska + Nebraska + + + New South Wales + New South Wales + + + New York + New York + + + Odisha + Odisha + + + Osh + Osh + + + Pando + Pando + + + Punjab + Punjab + + + Queensland + Queensland + + + Rajasthan + Rajasthan + + + Rajshahi + Rajshahi + + + Riau + Riau + + + Sabah + Sabah + + + Sahel + Sahel + + + Salyan + Salyan + + + Sindh + Sindh + + + South Australia + South Australia + + + Sulawesi + Sulawesi + + + Tamil Nadu + Tamil Nadu + + + Telangana + Telangana + + + Texas + Texas + + + Tigray + Tigray + + + Turkana + Turkana + + + Uttar Pradesh + Uttar Pradesh + + + Uttarakhand + Uttarakhand + + + Victoria + Victoria + + + Washington + Washington + + + West Bengal + West Bengal + + + Western Cape + Western Cape + + + Yogyakarta + Yogyakarta + + + + + Afghanistan + Afghanistan + + + Albania + Albania + + + Algeria + Algeria + + + American Samoa + American Samoa + + + Andorra + Andorra + + + Angola + Angola + + + Anguilla + Anguilla + + + Antarctica + Antarctica + + + Antigua and Barbuda + Antigua and Barbuda + + + Argentina + Argentina + + + Armenia + Armenia + + + Aruba + Aruba + + + Australia + Australia + + + Austria + Austria + + + Azerbaijan + Azerbaijan + + + Bahamas + Bahamas + + + Bahrain + Bahrain + + + Bangladesh + Bangladesh + + + Barbados + Barbados + + + Belarus + Belarus + + + Belgium + Belgium + + + Belize + Belize + + + Benin + Benin + + + Bermuda + Bermuda + + + Bhutan + Bhutan + + + Bolivia + Bolivia + + + Bosnia and Herzegovina + Bosnia and Herzegovina + + + Botswana + Botswana + + + Bouvet Island + Bouvet Island + + + Brazil + Brazil + + + Brunei Darussalam + Brunei Darussalam + + + Bulgaria + Bulgaria + + + Burkina Faso + Burkina Faso + + + Burundi + Burundi + + + Cambodia + Cambodia + + + Cameroon + Cameroon + + + Canada + Canada + + + Cabo Verde + Cabo Verde + + + Central African Republic + Central African Republic + + + Chad + Chad + + + 
Chile + Chile + + + China + China + + + Cocos (Keeling) Islands + Cocos (Keeling) Islands + + + Colombia + Colombia + + + Comoros + Comoros + + + Congo + Congo + + + Congo, Democratic Republic of + Congo, Democratic Republic of + + + Cook Islands + Cook Islands + + + Costa Rica + Costa Rica + + + Côte d'Ivoire + Côte d'Ivoire + + + Croatia + Croatia + + + Cuba + Cuba + + + Cyprus + Cyprus + + + Czech Republic + Czech Republic + + + Denmark + Denmark + + + Djibouti + Djibouti + + + Dominica + Dominica + + + Dominican Republic + Dominican Republic + + + Ecuador + Ecuador + + + Egypt + Egypt + + + El Salvador + El Salvador + + + Equatorial Guinea + Equatorial Guinea + + + Eritrea + Eritrea + + + Estonia + Estonia + + + Ethiopia + Ethiopia + + + Fiji + Fiji + + + Finland + Finland + + + France + France + + + Gabon + Gabon + + + Gambia + Gambia + + + Georgia + Georgia + + + Germany + Germany + + + Ghana + Ghana + + + Greece + Greece + + + Grenada + Grenada + + + Guadeloupe + Guadeloupe + + + Guatemala + Guatemala + + + Guinea + Guinea + + + Guinea-Bissau + Guinea-Bissau + + + Guyana + Guyana + + + Haiti + Haiti + + + Honduras + Honduras + + + Hungary + Hungary + + + Iceland + Iceland + + + India + India + + + Indonesia + Indonesia + + + Iran + Iran + + + Iraq + Iraq + + + Ireland + Ireland + + + Israel + Israel + + + Italy + Italy + + + Jamaica + Jamaica + + + Japan + Japan + + + Jordan + Jordan + + + Kazakhstan + Kazakhstan + + + Kenya + Kenya + + + Kiribati + Kiribati + + + Korea, DPR + Korea, DPR + + + Korea, Republic of + Korea, Republic of + + + Kuwait + Kuwait + + + Kyrgyzstan + Kyrgyzstan + + + Laos + Laos + + + Latvia + Latvia + + + Lebanon + Lebanon + + + Lesotho + Lesotho + + + Liberia + Liberia + + + Libya + Libya + + + Lithuania + Lithuania + + + Luxembourg + Luxembourg + + + North Macedonia + North Macedonia + + + Madagascar + Madagascar + + + Malawi + Malawi + + + Malaysia + Malaysia + + + Maldives + Maldives + + + Mali + Mali + + + Malta + Malta + + + 
Mauritania + Mauritania + + + Mauritius + Mauritius + + + Marshall Islands + Marshall Islands + + + Mexico + Mexico + + + Moldova + Moldova + + + Mongolia + Mongolia + + + Micronesia (Federated States of) + Micronesia (Federated States of) + + + Montenegro + Montenegro + + + Montserrat + Montserrat + + + Morocco + Morocco + + + Mozambique + Mozambique + + + Myanmar + Myanmar + + + Namibia + Namibia + + + Nepal + Nepal + + + Netherlands + Netherlands + + + New Zealand + New Zealand + + + Nicaragua + Nicaragua + + + Niger + Niger + + + Nigeria + Nigeria + + + Norway + Norway + + + Oman + Oman + + + Pakistan + Pakistan + + + Palestine, State of + Palestine, State of + + + Panama + Panama + + + Papua New Guinea + Papua New Guinea + + + Paraguay + Paraguay + + + Peru + Peru + + + Philippines + Philippines + + + Poland + Poland + + + Portugal + Portugal + + + Qatar + Qatar + + + Romania + Romania + + + Russia + Russia + + + Rwanda + Rwanda + + + Saint Kitts and Nevis + Saint Kitts and Nevis + + + Saint Lucia + Saint Lucia + + + Samoa + Samoa + + + Sao Tome and Principe + Sao Tome and Principe + + + Saudi Arabia + Saudi Arabia + + + Senegal + Senegal + + + Serbia + Serbia + + + Seychelles + Seychelles + + + Sierra Leone + Sierra Leone + + + Singapore + Singapore + + + Slovakia + Slovakia + + + Slovenia + Slovenia + + + Solomon Islands + Solomon Islands + + + Somalia + Somalia + + + South Africa + South Africa + + + South Sudan + South Sudan + + + Spain + Spain + + + Sri Lanka + Sri Lanka + + + Sudan + Sudan + + + Suriname + Suriname + + + Eswatini + Eswatini + + + Sweden + Sweden + + + Switzerland + Switzerland + + + Syria + Syria + + + Taiwan + Taiwan + + + Tajikistan + Tajikistan + + + Tanzania + Tanzania + + + Thailand + Thailand + + + Timor-Leste + Timor-Leste + + + Togo + Togo + + + Tokelau + Tokelau + + + Tonga + Tonga + + + Trinidad and Tobago + Trinidad and Tobago + + + Tunisia + Tunisia + + + Türkiye + Türkiye + + + Turkmenistan + Turkmenistan + + + Uganda + 
Uganda + + + Ukraine + Ukraine + + + United Arab Emirates + United Arab Emirates + + + United Kingdom + United Kingdom + + + United States + United States + + + Uruguay + Uruguay + + + Uzbekistan + Uzbekistan + + + Vanuatu + Vanuatu + + + Vatican City State + Vatican City State + + + Venezuela + Venezuela + + + Vietnam + Vietnam + + + Yemen + Yemen + + + Zambia + Zambia + + + Zimbabwe + Zimbabwe + + + + + AMAZON + AMAZON + + + ANDES + ANDES + + + GANGES + GANGES + + + INDUS + INDUS + + + KAREKH + KAREKH + + + LIMPOPO + LIMPOPO + + + MEKONG + MEKONG + + + NIGER + NIGER + + + NILE + NILE + + + SAO FRANCISCO + SAO FRANCISCO + + + VOLTA + VOLTA + + + YELLOW + YELLOW + + + + + N/A + + + + Internal Review + Internal Review + + + Peer Review + Peer Review + + + + + N/A + + + + CGIAR single centre + CGIAR single centre + + + CGIAR multi-centre + CGIAR multi-centre + + + CGIAR and developing country institute + CGIAR and developing country institute + + + CGIAR and advanced research institute + CGIAR and advanced research institute + + + Consultant + Consultant + + + Not CGIAR developing country institute + Not CGIAR developing country institute + + + Not CGIAR international institute + Not CGIAR international institute + + + + + ANDEAN ROOTS AND TUBERS + ANDEAN ROOTS AND TUBERS + + + BIGDATA + BIGDATA + + + BIODIVERSITY FOR THE FUTURE + BIODIVERSITY FOR THE FUTURE + + + BIOFORTIFICATION + BIOFORTIFICATION + + + BREEDING + BREEDING + + + CLIMATE CHANGE + CLIMATE CHANGE + + + CLIMATE-SMART AGRICULTURE + CLIMATE-SMART AGRICULTURE + + + CROP PROTECTION + CROP PROTECTION + + + CROP AND SYSTEMS SCIENCES CSS + CROP AND SYSTEMS SCIENCES CSS + + + CRYOPRESERVATION + CRYOPRESERVATION + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + GENDER + GENDER + + + GENEBANK + GENEBANK + + + GENETIC RESOURCES + GENETIC RESOURCES + + + GENETICS, GENOMICS AND CROP IMPROVEMENT SCIENCES GGCI + GENETICS, GENOMICS AND CROP IMPROVEMENT SCIENCES GGCI + + + IMPACT ASSESSMENT + 
IMPACT ASSESSMENT + + + INCLUSIVE GROWTH + INCLUSIVE GROWTH + + + NUTRITION + NUTRITION + + + NUTRITIONAL SECURITY + NUTRITIONAL SECURITY + + + POTATO AGRI-FOOD SYSTEMS + POTATO AGRI-FOOD SYSTEMS + + + POTATOES + POTATOES + + + SEED SYSTEMS + SEED SYSTEMS + + + SOCIAL AND NUTRITIONAL SCIENCES SNS + SOCIAL AND NUTRITIONAL SCIENCES SNS + + + SWEETPOTATOES + SWEETPOTATOES + + + SWEETPOTATO AGRI-FOOD SYSTEMS + SWEETPOTATO AGRI-FOOD SYSTEMS + + + + + ADVOCACY + ADVOCACY + + + AGRICULTURE + AGRICULTURE + + + AGRI-HEALTH + AGRI-HEALTH + + + AGROFORESTRY + AGROFORESTRY + + + AFLATOXINS + AFLATOXINS + + + AMR + AMR + + + ANIMAL BREEDING + ANIMAL BREEDING + + + ANIMAL CARE + ANIMAL CARE + + + ANIMAL DISEASES + ANIMAL DISEASES + + + ANIMAL FEEDING + ANIMAL FEEDING + + + ANIMAL HEALTH + ANIMAL HEALTH + + + ANIMAL PRODUCTION + ANIMAL PRODUCTION + + + ANIMAL PRODUCTS + ANIMAL PRODUCTS + + + ANIMAL WELFARE + ANIMAL WELFARE + + + APICULTURE + APICULTURE + + + ASF + ASF + + + BIODIVERSITY + BIODIVERSITY + + + BIOTECHNOLOGY + BIOTECHNOLOGY + + + BIRD FLU + BIRD FLU + + + BREEDS + BREEDS + + + BRUCELLOSIS + BRUCELLOSIS + + + BUFFALO + BUFFALO + + + BUSHMEAT + BUSHMEAT + + + CAPACITY STRENGTHENING + CAPACITY STRENGTHENING + + + CAMELS + CAMELS + + + CATTLE + CATTLE + + + CBPP + CBPP + + + CHICKENS + CHICKENS + + + CLIMATE CHANGE + CLIMATE CHANGE + + + COMMUNICATIONS + COMMUNICATIONS + + + CONSUMPTION + CONSUMPTION + + + COVID19 + COVID19 + + + CROP RESIDUES + CROP RESIDUES + + + CSF + CSF + + + CROP-LIVESTOCK + CROP-LIVESTOCK + + + CROPS + CROPS + + + DAIRYING + DAIRYING + + + DATA + DATA + + + DIAGNOSTICS + DIAGNOSTICS + + + DIET + DIET + + + DISEASE CONTROL + DISEASE CONTROL + + + DROUGHT + DROUGHT + + + DRYLANDS + DRYLANDS + + + ECF + ECF + + + EMERGING DISEASES + EMERGING DISEASES + + + ENVIRONMENT + ENVIRONMENT + + + EPIDEMIOLOGY + EPIDEMIOLOGY + + + EXTENSION + EXTENSION + + + FARM MANAGEMENT + FARM MANAGEMENT + + + FARMING SYSTEMS + FARMING SYSTEMS + + + FEEDS + FEEDS + + + 
FISH + FISH + + + FMD + FMD + + + FODDER + FODDER + + + FOOD SAFETY + FOOD SAFETY + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + FORAGES + FORAGES + + + FORESTRY + FORESTRY + + + GENETICS + GENETICS + + + GENETIC RESOURCES + GENETIC RESOURCES + + + GENDER + GENDER + + + GEODATA + GEODATA + + + GHG EMISSIONS + GHG EMISSIONS + + + GOATS + GOATS + + + HUMAN HEALTH + HUMAN HEALTH + + + HIV-AIDS + HIV-AIDS + + + HUMID TROPICS + HUMID TROPICS + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INDIGENOUS BREEDS + INDIGENOUS BREEDS + + + INNOVATION SYSTEMS + INNOVATION SYSTEMS + + + INTENSIFICATION + INTENSIFICATION + + + INSURANCE + INSURANCE + + + IRRIGATION + IRRIGATION + + + KNOWLEDGE AND INFORMATION + KNOWLEDGE AND INFORMATION + + + LIVELIHOODS + LIVELIHOODS + + + LEGUMES + LEGUMES + + + LIVESTOCK + LIVESTOCK + + + LIVESTOCK SYSTEMS + LIVESTOCK SYSTEMS + + + LIVESTOCK-WATER + LIVESTOCK-WATER + + + MARKETS + MARKETS + + + MEAT + MEAT + + + MERS + MERS + + + NRM + NRM + + + NUTRITION + NUTRITION + + + ONE HEALTH + ONE HEALTH + + + PARTICIPATION + PARTICIPATION + + + PASTORALISM + PASTORALISM + + + PESTS + PESTS + + + PIGS + PIGS + + + POULTRY + POULTRY + + + POLICY + POLICY + + + PPR + PPR + + + PRO-POOR LIVESTOCK + PRO-POOR LIVESTOCK + + + RANGELANDS + RANGELANDS + + + REPRODUCTION + REPRODUCTION + + + RESEARCH + RESEARCH + + + RESILIENCE + RESILIENCE + + + RVF + RVF + + + SCALING + SCALING + + + SEEDS + SEEDS + + + SHEEP + SHEEP + + + SMALL RUMINANTS + SMALL RUMINANTS + + + SOCIAL LEARNING + SOCIAL LEARNING + + + SOILS + SOILS + + + TRADE + TRADE + + + TRYPANOSOMIASIS + TRYPANOSOMIASIS + + + VACCINES + VACCINES + + + VALUE CHAINS + VALUE CHAINS + + + VULNERABILITY + VULNERABILITY + + + WATER + WATER + + + WILD MEAT + WILD MEAT + + + WILDLIFE + WILDLIFE + + + WILDLIFE CONSERVATION + WILDLIFE CONSERVATION + + + WOMEN + WOMEN + + + ZOONOTIC DISEASES + ZOONOTIC DISEASES + + + + + AGRICULTURE + AGRICULTURE + + + AGROFORESTRY + AGROFORESTRY + + + 
BANANA + BANANA + + + BEANS + BEANS + + + BIODIVERSITY + BIODIVERSITY + + + BIOFORTIFICATION + BIOFORTIFICATION + + + CACAO + CACAO + + + CASSAVA + CASSAVA + + + CAPACITY DEVELOPMENT + CAPACITY DEVELOPMENT + + + CLIMATE CHANGE + CLIMATE CHANGE + + + CLIMATE CHANGE ADAPTATION + CLIMATE CHANGE ADAPTATION + + + CLIMATE CHANGE MITIGATION + CLIMATE CHANGE MITIGATION + + + COCONUT + COCONUT + + + CONSERVATION AND USE + CONSERVATION AND USE + + + CROP PRODUCTION + CROP PRODUCTION + + + CROP WILD RELATIVES + CROP WILD RELATIVES + + + DOCUMENTATION + DOCUMENTATION + + + ECOSYSTEM SERVICES + ECOSYSTEM SERVICES + + + ECONOMICS + ECONOMICS + + + EXTENSION + EXTENSION + + + FARMING SYSTEMS + FARMING SYSTEMS + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + FORESTRY + FORESTRY + + + GENDER AND EQUITY + GENDER AND EQUITY + + + GENETIC RESOURCES + GENETIC RESOURCES + + + GERMPLASM CONSERVATION + GERMPLASM CONSERVATION + + + GOVERNANCE + GOVERNANCE + + + HEALTH + HEALTH + + + HOME GARDENS + HOME GARDENS + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INDIGENOUS KNOWLEDGE + INDIGENOUS KNOWLEDGE + + + INFORMATICS + INFORMATICS + + + INFORMATION SYSTEMS + INFORMATION SYSTEMS + + + KNOWLEDGE MANAGEMENT + KNOWLEDGE MANAGEMENT + + + LAND USE + LAND USE + + + LIVELIHOODS + LIVELIHOODS + + + LIVESTOCK + LIVESTOCK + + + MARKETS + MARKETS + + + MODELING + MODELING + + + MONITORING AND REPORTING + MONITORING AND REPORTING + + + NATURAL RESOURCE MANAGEMENT + NATURAL RESOURCE MANAGEMENT + + + NEGLECTED AND UNDERUTILIZED SPECIES + NEGLECTED AND UNDERUTILIZED SPECIES + + + NUTRITION + NUTRITION + + + PARTICIPATORY RESEARCH + PARTICIPATORY RESEARCH + + + PESTS AND DISEASES + PESTS AND DISEASES + + + PLANT BREEDING + PLANT BREEDING + + + PLANT GENETIC RESOURCES + PLANT GENETIC RESOURCES + + + POLICY + POLICY + + + RESILIENCE + RESILIENCE + + + RESTORATION + RESTORATION + + + RICE + RICE + + + RURAL COMMUNITIES + RURAL COMMUNITIES + + + SEED SYSTEMS + SEED SYSTEMS + + + 
SMALLHOLDER FARMERS + SMALLHOLDER FARMERS + + + SOIL HEALTH + SOIL HEALTH + + + SOIL INFORMATION + SOIL INFORMATION + + + SOIL LANDSCAPES + SOIL LANDSCAPES + + + STANDARDS + STANDARDS + + + SUSTAINABILITY + SUSTAINABILITY + + + TREE CROPS + TREE CROPS + + + TROPICAL FORAGES + TROPICAL FORAGES + + + VALUE CHAINS + VALUE CHAINS + + + WATER + WATER + + + + + AFLATOXIN + AFLATOXIN + + + AGRIBUSINESS + AGRIBUSINESS + + + AGRONOMY + AGRONOMY + + + BANANA + BANANA + + + BASELINE SURVEY + BASELINE SURVEY + + + BIODIVERSITY + BIODIVERSITY + + + BIOFORTIFICATION + BIOFORTIFICATION + + + BIOMETRICS + BIOMETRICS + + + BIOSCIENCE + BIOSCIENCE + + + CAPACITY DEVELOPMENT + CAPACITY DEVELOPMENT + + + CASSAVA + CASSAVA + + + CLIMATE CHANGE + CLIMATE CHANGE + + + COCOA + COCOA + + + COWPEA + COWPEA + + + CROP HUSBANDRY + CROP HUSBANDRY + + + CROP SYSTEMS + CROP SYSTEMS + + + DISEASE CONTROL + DISEASE CONTROL + + + DOMESTIC TRADE + DOMESTIC TRADE + + + FARM MANAGEMENT + FARM MANAGEMENT + + + FARMING SYSTEMS + FARMING SYSTEMS + + + FOOD SCIENCE + FOOD SCIENCE + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + FORESTRY + FORESTRY + + + GENDER + GENDER + + + GENETIC IMPROVEMENT + GENETIC IMPROVEMENT + + + GRAIN LEGUMES + GRAIN LEGUMES + + + HANDLING, TRANSPORT, STORAGE AND PROTECTION OF AGRICULTURAL PRODUCTS + HANDLING, TRANSPORT, STORAGE AND PROTECTION OF AGRICULTURAL PRODUCTS + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INTEGRATED SOIL FERTILITY MANAGEMENT + INTEGRATED SOIL FERTILITY MANAGEMENT + + + KNOWLEDGE MANAGEMENT + KNOWLEDGE MANAGEMENT + + + LAND USE + LAND USE + + + LIVELIHOODS + LIVELIHOODS + + + MAIZE + MAIZE + + + MARKETS + MARKETS + + + METEOROLOGY AND CLIMATOLOGY + METEOROLOGY AND CLIMATOLOGY + + + NATURAL RESOURCE MANAGEMENT + NATURAL RESOURCE MANAGEMENT + + + NUTRITION + NUTRITION + + + PESTS OF PLANTS + PESTS OF PLANTS + + + PLANT BREEDING + PLANT BREEDING + + + PLANT DISEASES + PLANT DISEASES + + + PLANT ECOLOGY + PLANT ECOLOGY + + + 
PLANT GENETIC RESOURCES + PLANT GENETIC RESOURCES + + + PLANT HEALTH + PLANT HEALTH + + + PLANT PRODUCTION + PLANT PRODUCTION + + + PLANTAIN + PLANTAIN + + + POLICIES AND INSTITUTIONS + POLICIES AND INSTITUTIONS + + + POST-HARVESTING TECHNOLOGY + POST-HARVESTING TECHNOLOGY + + + RESEARCH METHOD + RESEARCH METHOD + + + SMALLHOLDER FARMERS + SMALLHOLDER FARMERS + + + SOCIOECONOMY + SOCIOECONOMY + + + SOIL FERTILITY + SOIL FERTILITY + + + SOIL HEALTH + SOIL HEALTH + + + SOIL INFORMATION + SOIL INFORMATION + + + SOIL SURVEYS AND MAPPING + SOIL SURVEYS AND MAPPING + + + SOYBEAN + SOYBEAN + + + TISSUE CULTURE + TISSUE CULTURE + + + VALUE CHAINS + VALUE CHAINS + + + WEEDS + WEEDS + + + YAM + YAM + + + + + BIOMETRICS + BIOMETRICS + + + BIOTECH & PLANT BREEDING + BIOTECH & PLANT BREEDING + + + NATURAL RESOURCE MANAGEMENT + NATURAL RESOURCE MANAGEMENT + + + NUTRITION & HUMAN HEALTH + NUTRITION & HUMAN HEALTH + + + PLANT PRODUCTION & HEALTH + PLANT PRODUCTION & HEALTH + + + SOCIAL SCIENCE & AGRIBUSINESS + SOCIAL SCIENCE & AGRIBUSINESS + + + + + N/A + + + + Open Access + Open Access + + + Limited Access + Limited Access + + + + + Choose One + + + + Creative Commons Attribution 4.0 (CC BY 4.0) + CC-BY-4.0 + + + Creative Commons Attribution-ShareAlike 4.0 (CC BY-SA 4.0) + CC-BY-SA-4.0 + + + Creative Commons Attribution-NoDerivatives 4.0 (CC BY-ND 4.0) + CC-BY-ND-4.0 + + + Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0) + CC-BY-NC-4.0 + + + Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0) + CC-BY-NC-SA-4.0 + + + Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 (CC BY-NC-ND 4.0) + CC-BY-NC-ND-4.0 + + + Creative Commons Attribution 3.0 (CC BY 3.0) + CC-BY-3.0 + + + Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) + CC-BY-SA-3.0 + + + Creative Commons Attribution-NoDerivs 3.0 (CC BY-ND 3.0) + CC-BY-ND-3.0 + + + Creative Commons Attribution-NonCommercial 3.0 (CC BY-NC 3.0) + CC-BY-NC-3.0 + + + Creative Commons
Attribution-NonCommercial-ShareAlike 3.0 (CC BY-NC-SA 3.0) + CC-BY-NC-SA-3.0 + + + Creative Commons Attribution-NonCommercial-NoDerivs 3.0 (CC BY-NC-ND 3.0) + CC-BY-NC-ND-3.0 + + + Creative Commons Attribution 3.0 IGO (CC BY 3.0 IGO) + CC-BY-3.0-IGO + + + Creative Commons Attribution-ShareAlike 3.0 IGO (CC BY-SA 3.0 IGO) + CC-BY-SA-3.0-IGO + + + Creative Commons Attribution-NoDerivs 3.0 IGO (CC BY-ND 3.0 IGO) + CC-BY-ND-3.0-IGO + + + Creative Commons Attribution-NonCommercial 3.0 IGO (CC BY-NC 3.0 IGO) + CC-BY-NC-3.0-IGO + + + Creative Commons Attribution-NonCommercial-ShareAlike 3.0 IGO (CC BY-NC-SA 3.0 IGO) + CC-BY-NC-SA-3.0-IGO + + + Creative Commons Attribution-NonCommercial-NoDerivs 3.0 IGO (CC BY-NC-ND 3.0 IGO) + CC-BY-NC-ND-3.0-IGO + + + + Creative Commons Zero Public Domain Dedication 1.0 (CC0 1.0) + CC0-1.0 + + + Creative Commons Attribution No Version (CC BY) + CC-BY + + + Creative Commons Attribution-ShareAlike No Version (CC BY-SA) + CC-BY-SA + + + Creative Commons Attribution-NoDerivatives No Version (CC BY-ND) + CC-BY-ND + + + Creative Commons Attribution-NonCommercial No Version (CC BY-NC) + CC-BY-NC + + + Creative Commons Attribution-NonCommercial-ShareAlike No Version (CC BY-NC-SA) + CC-BY-NC-SA + + + Creative Commons Attribution-NonCommercial-NoDerivatives No Version (CC BY-NC-ND) + CC-BY-NC-ND + + + Open Government Licence v3.0 (OGL-UK-3.0) + OGL-UK-3.0 + + + GNU General Public License v3.0 (GPL-3.0-only) + GPL-3.0-only + + + MIT License + MIT + + + Copyrighted; all rights reserved + Copyrighted; all rights reserved + + + Copyrighted; Non-commercial educational use only + Copyrighted; Non-commercial educational use only + + + Copyrighted; Non-commercial use only + Copyrighted; Non-commercial use only + + + All rights reserved; self-archive copy only + All rights reserved; self-archive copy only + + + All rights reserved; no re-use allowed + All rights reserved; no re-use allowed + + + Other + Other + + + + + Non-ISI Journal + + + + ISI Journal + 
ISI Journal + + + + + N/A + + + + Grey Literature + Grey Literature + + + Formally Published + Formally Published + + + + + Academics + Academics + + + CGIAR + CGIAR + + + Development Practitioners + Development Practitioners + + + Donors + Donors + + + Extension + Extension + + + Farmers + Farmers + + + General Public + General Public + + + NGOs + NGOs + + + Policy Makers + Policy Makers + + + Scientists + Scientists + + + From 269f6971fc67d61130821c12161c255fb085f660 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 14 Jun 2023 08:47:54 +0300 Subject: [PATCH 023/119] dspace/config: enable Sherpa Romeo summary for ISSNs Just to show that it's possible and get feedback from our submitters. --- dspace/config/item-submission.xml | 2 +- dspace/config/spring/api/sherpa.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/item-submission.xml b/dspace/config/item-submission.xml index f5fcef7266d9..6abdc56e230c 100644 --- a/dspace/config/item-submission.xml +++ b/dspace/config/item-submission.xml @@ -272,7 +272,7 @@ - + diff --git a/dspace/config/spring/api/sherpa.xml b/dspace/config/spring/api/sherpa.xml index 0414f3f8e4b4..7c7c657d6ebc 100644 --- a/dspace/config/spring/api/sherpa.xml +++ b/dspace/config/spring/api/sherpa.xml @@ -17,7 +17,7 @@ - dc.identifier.issn + cg.issn From b0cd680d3650885298fe97cd9bb778adc49af272 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 16 Jun 2023 13:11:59 +0300 Subject: [PATCH 024/119] dspace.cfg: enable item counts aka "strengths" --- dspace/config/dspace.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 6fea2e3dbeb2..bcab020d789a 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1075,7 +1075,7 @@ webui.preview.brand.fontpoint = 12 # Whether to display collection and community strengths (i.e. item counts) # By default, this feature is disabled. 
-# webui.strengths.show = false +webui.strengths.show = true # Counts fetched in real time will perform an actual count of the # index contents every time a page with this feature is requested, From 3600736d400d5f423fae4fe65a1ca14e32b59756 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 16 Jun 2023 13:17:17 +0300 Subject: [PATCH 025/119] =?UTF-8?q?dspace/config:=20dc.date.issued=20?= =?UTF-8?q?=E2=86=92=20dcterms.issued?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update a few Discovery configs to use dcterms.issued instead of dc.date.issued. --- dspace/config/modules/discovery.cfg | 2 +- dspace/config/spring/api/discovery.xml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dspace/config/modules/discovery.cfg b/dspace/config/modules/discovery.cfg index 72088ddc49fa..4357980f87ba 100644 --- a/dspace/config/modules/discovery.cfg +++ b/dspace/config/modules/discovery.cfg @@ -22,7 +22,7 @@ discovery.search.server = ${solr.server}/${solr.multicorePrefix}search # discovery.index.ignore-variants = false # discovery.index.ignore-authority = false -discovery.index.projection=dc.title,dc.contributor.*,dc.date.issued +discovery.index.projection=dc.title,dc.contributor.*,dcterms.issued # Allow auto-reindexing. # If any database migrations are applied to your database (via Flyway), then a diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index c1fdc1351c0c..bb338cf16037 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -3021,7 +3021,7 @@
- + @@ -3096,7 +3096,7 @@
- + From 228d66b0414178b42e6f2c80b0addade0f1de431 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 21 Jun 2023 14:19:59 +0300 Subject: [PATCH 026/119] dspace/config: enable LDAP authentication I was surprised to see this was disabled, as I already tested it a few months ago. --- dspace/config/modules/authentication-ldap.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/modules/authentication-ldap.cfg b/dspace/config/modules/authentication-ldap.cfg index 422f2f57e132..8b6416bf4de9 100644 --- a/dspace/config/modules/authentication-ldap.cfg +++ b/dspace/config/modules/authentication-ldap.cfg @@ -27,7 +27,7 @@ # With the setting off, users will be required to register and login with # their email address. With this setting on, users will be able to login # and register with their LDAP user ids and passwords. -authentication-ldap.enable = false +authentication-ldap.enable = true ##### LDAP AutoRegister Settings ##### From 95fdbb13d7da1557e9b17ce8188fcebddc711b97 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 21 Jun 2023 14:36:12 +0300 Subject: [PATCH 027/119] dspace/config: use OU=ILRIHUB in ldap group map The DC=ILRI no longer exists in Active Directory as far as I can see from checking a few users. --- dspace/config/modules/authentication-ldap.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dspace/config/modules/authentication-ldap.cfg b/dspace/config/modules/authentication-ldap.cfg index 8b6416bf4de9..b2916b88e396 100644 --- a/dspace/config/modules/authentication-ldap.cfg +++ b/dspace/config/modules/authentication-ldap.cfg @@ -149,8 +149,7 @@ authentication-ldap.search_scope = 2 # in user's full DN. If it's found, assign user to the DSpace group # specified by the right part of the groupmap value (after the ":"). # One user may belong to multiple groups. 
-authentication-ldap.login.groupmap.1 = DC=ILRI:ILRI_LDAP_USERS -authentication-ldap.login.groupmap.2 = OU=ILRIHUB:ILRI_LDAP_USERS +authentication-ldap.login.groupmap.1 = OU=ILRIHUB:ILRI_LDAP_USERS #authentication-ldap.login.groupmap.3 = ou=ldap-dept3:dspace-groupA # If this property is uncommented, it changes the meaning of the left part of From f91bc4250cd7488000c3532d503bcb18f0ea7b59 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 21 Jun 2023 14:38:25 +0300 Subject: [PATCH 028/119] dspace/config: disable ldap groupmap There seems to be a bug in DSpace 7.6-SNAPSHOT that causes LDAP logins to fail when the groupmap is enabled. --- dspace/config/modules/authentication-ldap.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/modules/authentication-ldap.cfg b/dspace/config/modules/authentication-ldap.cfg index b2916b88e396..50ae87f98809 100644 --- a/dspace/config/modules/authentication-ldap.cfg +++ b/dspace/config/modules/authentication-ldap.cfg @@ -149,7 +149,7 @@ authentication-ldap.search_scope = 2 # in user's full DN. If it's found, assign user to the DSpace group # specified by the right part of the groupmap value (after the ":"). # One user may belong to multiple groups. 
-authentication-ldap.login.groupmap.1 = OU=ILRIHUB:ILRI_LDAP_USERS +#authentication-ldap.login.groupmap.1 = OU=ILRIHUB:ILRI_LDAP_USERS #authentication-ldap.login.groupmap.3 = ou=ldap-dept3:dspace-groupA # If this property is uncommented, it changes the meaning of the left part of From ed37bb419f21d5313412991f84e913800aa31ae6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 21 Jun 2023 16:52:50 +0300 Subject: [PATCH 029/119] dspace/config: update abstract in discovery.xml --- dspace/config/spring/api/discovery.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index bb338cf16037..c7e37fcbf17b 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -256,7 +256,7 @@
- + From 02b1cae566befb0d2609b80e965d21351d65aa7f Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 27 Jun 2023 15:58:10 +0300 Subject: [PATCH 030/119] dspace/config: remove subregion value pairs from submission form We are using an external controlled vocabulary instead. This hasn't been needed since we got rid of Atmire's Listings and Reports module in DSpace 6. --- dspace/config/submission-forms.xml | 326 ----------------------------- 1 file changed, 326 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index f531bbedd82c..49488049c7da 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -3264,332 +3264,6 @@ Western Europe - - - Alberta - Alberta - - - An Giang - An Giang - - - Andhra Pradesh - Andhra Pradesh - - - Ashanti - Ashanti - - - Assam - Assam - - - Badulla - Badulla - - - Bali - Bali - - - Balochistan - Balochistan - - - Bamako - Bamako - - - Bihar - Bihar - - - British Columbia - British Columbia - - - Cajamarca - Cajamarca - - - California - California - - - Cauca - Cauca - - - Chhattisgarh - Chhattisgarh - - - Dakar - Dakar - - - Delaware - Delaware - - - Florida - Florida - - - Gampaha - Gampaha - - - Gandaki - Gandaki - - - Guanajuato - Guanajuato - - - Gujarat - Gujarat - - - Haryana - Haryana - - - Helmand - Helmand - - - Himachal Pradesh - Himachal Pradesh - - - Houaphan - Houaphan - - - Iowa - Iowa - - - Iringa - Iringa - - - Jambi - Jambi - - - Jharkhand - Jharkhand - - - Kaabong - Kaabong - - - kalimantan - kalimantan - - - Kampala - Kampala - - - Kandal - Kandal - - - Karnali - Karnali - - - Karnataka - Karnataka - - - Kasungu - Kasungu - - - Kerala - Kerala - - - Khon Kaen - Khon Kaen - - - Kunduz - Kunduz - - - Kwazulu-Natal - Kwazulu-Natal - - - Lima - Lima - - - Limpopo - Limpopo - - - Madhya Pradesh - Madhya Pradesh - - - Maharashtra - Maharashtra - - - Maputo - Maputo - - - Masvingo - Masvingo - - - Meghalaya - Meghalaya - - - Mendoza - Mendoza - 
- - Missouri - Missouri - - - Nabeul - Nabeul - - - Nagaland - Nagaland - - - Nebraska - Nebraska - - - New South Wales - New South Wales - - - New York - New York - - - Odisha - Odisha - - - Osh - Osh - - - Pando - Pando - - - Punjab - Punjab - - - Queensland - Queensland - - - Rajasthan - Rajasthan - - - Rajshahi - Rajshahi - - - Riau - Riau - - - Sabah - Sabah - - - Sahel - Sahel - - - Salyan - Salyan - - - Sindh - Sindh - - - South Australia - South Australia - - - Sulawesi - Sulawesi - - - Tamil Nadu - Tamil Nadu - - - Telangana - Telangana - - - Texas - Texas - - - Tigray - Tigray - - - Turkana - Turkana - - - Uttar Pradesh - Uttar Pradesh - - - Uttarakhand - Uttarakhand - - - Victoria - Victoria - - - Washington - Washington - - - West Bengal - West Bengal - - - Western Cape - Western Cape - - - Yogyakarta - Yogyakarta - - Afghanistan From d59a5a32ac73d984c88da9bbd3eaebee2a0b4dd6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 27 Jun 2023 16:00:32 +0300 Subject: [PATCH 031/119] dspace/config: update discovery.xml Change a few more references from dc.subject to dcterms.subject. --- dspace/config/spring/api/discovery.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index c7e37fcbf17b..8693b2ff6600 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -303,7 +303,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject
@@ -440,7 +440,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject @@ -579,7 +579,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject @@ -719,7 +719,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject From 8d508f6c6815ebbd10620afe39de9e8a977699b4 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 28 Jun 2023 10:28:56 +0300 Subject: [PATCH 032/119] dspace/config: update discovery.xml Replace a few more references to dc.description.abstract, since we are using dcterms.abstract. --- dspace/config/spring/api/discovery.xml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index 8693b2ff6600..06d2bb17e482 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -405,7 +405,7 @@ - + @@ -544,7 +544,7 @@ - + @@ -684,7 +684,7 @@ - + @@ -794,7 +794,7 @@ - + @@ -869,7 +869,7 @@ - + @@ -945,7 +945,7 @@ - + @@ -1019,7 +1019,7 @@ - + @@ -1094,7 +1094,7 @@ - + From e902e7a3d0f8369f05c1686016314af6f5a5a081 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 29 Jun 2023 14:25:55 +0300 Subject: [PATCH 033/119] dspace/config: remove srsc from submission-forms.xml Remove this so that DSpace doesn't automatically try to create a browse option for it. This is a side effect of a new browse feature in DSpace 7.6, where vocabularies in the submission form are auto- matically configured as browses. --- dspace/config/submission-forms.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 49488049c7da..fd86f6e18b6f 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -1575,7 +1575,6 @@ tag Enter appropriate subject keywords or phrases. 
- srsc From 1a03ab87040809ddeabda0675ae9aa6a1806ae28 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 26 Jul 2023 17:08:53 +0300 Subject: [PATCH 034/119] dspace.cfg: enable linked browse for some fields The implementation is awkward, but seems to work if we list all our fields one by one. See: https://wiki.lyrasis.org/display/DSDOC7x/Configuration+Reference --- dspace/config/dspace.cfg | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index bcab020d789a..d5df33557340 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1165,6 +1165,10 @@ webui.browse.index.1 = dateissued:item:dateissued webui.browse.index.2 = author:metadata:dc.contributor.*\,dc.creator:text webui.browse.index.3 = title:item:title webui.browse.index.4 = subject:metadata:dc.subject.*\,dcterms.subject:text +webui.browse.index.5 = region:metadata:cg.coverage.region:text +webui.browse.index.6 = country:metadata:cg.coverage.country:text +webui.browse.index.7 = subregion:metadata:cg.coverage.subregion:text +webui.browse.index.8 = itemtype:metadata:dcterms.type:text ## example of authority-controlled browse category - see authority control config #webui.browse.index.5 = lcAuthor:metadataAuthority:dc.contributor.author:authority @@ -1246,6 +1250,11 @@ plugin.named.org.dspace.sort.OrderFormatDelegate= \ # # The default below defines the authors to link to other publications by that author webui.browse.link.1 = author:dc.contributor.* +webui.browse.link.2 = subject:dcterms.subject +webui.browse.link.3 = region:cg.coverage.region +webui.browse.link.4 = country:cg.coverage.country +webui.browse.link.5 = subregion:cg.coverage.subregion +webui.browse.link.6 = itemtype:dcterms.type #### Display browse frequencies # @@ -1472,7 +1481,7 @@ log.report.dir = ${dspace.dir}/log # You can add more than one 'mark_[value]' options (with different value) in case you need to mark items more than one time for # different 
purposes. Remember to add the respective beans in file 'config/spring/api/item-marking.xml'. # -webui.itemlist.columns = dcterms.accessRights,dcterms.issued(date),dcterms.type,dc.title,dc.contributor.* +webui.itemlist.columns = dcterms.accessRights,dcterms.issued(date),dcterms.type,dc.title,dc.contributor.*,cg.coverage.country,cg.coverage.region,cg.coverage.subregion # # Additionally, you can override the DC fields used on the listing page for # a given browse index and/or sort option. As a sort option or index may be defined From 1f333308ea5433e91d2f0793684707497b62c976 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 14 Aug 2023 17:52:05 +0200 Subject: [PATCH 035/119] dspace-rest: allow bundleName parameter in bitstream endpoint Port of DSpace 6.x patch to legacy REST API in DSpace 7. See: https://github.com/DSpace/DSpace/pull/8343 --- .../java/org/dspace/rest/ItemsResource.java | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java b/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java index 615aacac21cc..7c7dbc458c4d 100644 --- a/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java +++ b/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java @@ -443,7 +443,7 @@ public Response addItemMetadata(@PathParam("item_id") String itemId, @POST @Path("/{item_id}/bitstreams") public Bitstream addItemBitstream(@PathParam("item_id") String itemId, InputStream inputStream, - @QueryParam("name") String name, @QueryParam("description") String description, + @QueryParam("name") String name, @QueryParam("description") String description, @QueryParam("bundleName") String bundleName, @QueryParam("groupId") String groupId, @QueryParam("year") Integer year, @QueryParam("month") Integer month, @QueryParam("day") Integer day, @QueryParam("userIP") String user_ip, @@ -467,15 +467,29 @@ public Bitstream addItemBitstream(@PathParam("item_id") String itemId, InputStre 
log.trace("Creating bitstream in item."); org.dspace.content.Bundle bundle = null; org.dspace.content.Bitstream dspaceBitstream = null; - List bundles = itemService.getBundles(dspaceItem, org.dspace.core.Constants.CONTENT_BUNDLE_NAME); + List bundles = dspaceItem.getBundles(); + + // add bitstream to specified bundle + if (bundleName == null) { + bundleName = "ORIGINAL"; + } + for (Bundle existingBundle : bundles) + { + if (existingBundle.getName().equals(bundleName)) + { + bundle = existingBundle; + break; + } + } if (bundles != null && bundles.size() != 0) { bundle = bundles.get(0); // There should be only one bundle ORIGINAL. } if (bundle == null) { - log.trace("Creating bundle in item."); - dspaceBitstream = itemService.createSingleBitstream(context, inputStream, dspaceItem); + log.trace("Creating bundle "+bundleName+" in item."); + dspaceBitstream = itemService.createSingleBitstream(context, inputStream, dspaceItem, bundleName); } else { + log.trace("Getting bundle from item."); dspaceBitstream = bitstreamService.create(context, bundle, inputStream); } From 2316be2793340ba6502fab4aef7181dd5c37f8b3 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 30 Aug 2023 22:32:13 +0300 Subject: [PATCH 036/119] discovery.xml: move subject facet up --- dspace/config/spring/api/discovery.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index 06d2bb17e482..75ee1e28639a 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -168,6 +168,7 @@ + @@ -190,11 +191,11 @@ + - From 6924ae6b7228669e948762272a2d6c4ebb05aa2c Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 31 Aug 2023 11:29:20 +0300 Subject: [PATCH 037/119] discovery.xml: add publisher to search facets --- dspace/config/spring/api/discovery.xml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dspace/config/spring/api/discovery.xml 
b/dspace/config/spring/api/discovery.xml index 75ee1e28639a..4ca0714e65bd 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -166,6 +166,7 @@ + @@ -189,6 +190,7 @@ + @@ -3011,6 +3013,18 @@ + + + + + dcterms.publisher + + + + + + + From f613739900e4216074d86110bcfab693b0a76677 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 4 Sep 2023 19:38:11 +0300 Subject: [PATCH 038/119] discovery.xml: Update indexes We should use the same capitalization that we use elsewhere so it is more predictable and consistent. --- dspace/config/spring/api/discovery.xml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index 4ca0714e65bd..94f8994a3101 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -2990,7 +2990,7 @@ - + cg.subject.impactArea @@ -3002,7 +3002,7 @@ - + cg.subject.actionArea @@ -3025,6 +3025,18 @@ + + + + + cg.subject.impactPlatform + + + + + + + From e3aa943362cf9d352621e5a995b69f865e49cd13 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Sep 2023 17:02:36 +0300 Subject: [PATCH 039/119] submission-forms.xml: use input type tag for more fields For any fields that have more than a dozen or so values and don't need free text, we can use the tag input type. --- dspace/config/submission-forms.xml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index fd86f6e18b6f..6e20941d0927 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -364,7 +364,7 @@ true Indicate the main audience for whom the item is produced. - dropdown + tag @@ -434,7 +434,7 @@ true - dropdown + tag Select the language of the main content of the item. If the language does not appear in the list, please select 'Other'. 
If the content does not really have a language (for example, if it is a dataset or an image) please leave this blank. @@ -454,7 +454,7 @@ initiative true - dropdown + tag Select any CGIAR Initiatives(s) associated with this item. Use this to show that an Initiative funded this item. @@ -466,7 +466,7 @@ crp true - dropdown + tag Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. @@ -478,7 +478,7 @@ sdg true - dropdown + tag Select any UN Sustainable Development Goals associated with this item. @@ -512,7 +512,7 @@ country true - dropdown + tag Select a country or countries within the scope of the item. @@ -606,7 +606,7 @@ alliancebiovciat true - dropdown + tag @@ -618,7 +618,7 @@ cip true - dropdown + tag @@ -630,7 +630,7 @@ ilri true - dropdown + tag From 98f124fb1c031978aca561d25d708c5612683fc1 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Sep 2023 17:04:36 +0300 Subject: [PATCH 040/119] submission-forms.xml: move CRPs down --- dspace/config/submission-forms.xml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 6e20941d0927..e946fad53021 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -459,18 +459,6 @@ - - - cg - contributor - crp - true - - tag - Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. - - - cg @@ -505,6 +493,18 @@ + + + cg + contributor + crp + true + + tag + Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. 
+ + + cg From 9b8cacd721dec778ae3d3edd1df7f75cb2a69104 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Sep 2023 17:05:14 +0300 Subject: [PATCH 041/119] submission-forms.xml: move SDGs down --- dspace/config/submission-forms.xml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index e946fad53021..78b977a7b106 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -459,18 +459,6 @@ - - - cg - subject - sdg - true - - tag - Select any UN Sustainable Development Goals associated with this item. - - - cg @@ -493,6 +481,18 @@ + + + cg + subject + sdg + true + + tag + Select any UN Sustainable Development Goals associated with this item. + + + cg From 2d2f5c21d146e030bd1dec05d7e358f0d59e46f6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Sep 2023 17:10:41 +0300 Subject: [PATCH 042/119] submission-forms.xml: add CGIAR Impact Platforms --- dspace/config/submission-forms.xml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 78b977a7b106..ef8afa2acdd6 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -459,6 +459,18 @@ + + + cg + subject + impactPlatform + true + + dropdown + Select any CGIAR Impact Platforms associated with this item. 
+ + + cg @@ -3085,6 +3097,16 @@ SDG 17 - Partnerships for the goals + + + Climate Change + Climate Change + + + Gender + Gender + + Climate adaptation and mitigation From 0b0b232a92f330e084a2b7791e33cb45dca172ae Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Sep 2023 17:27:53 +0300 Subject: [PATCH 043/119] submission-forms.xml: indicate CRPs are 2012-2021 --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index ef8afa2acdd6..f57be387cb33 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -511,7 +511,7 @@ contributor crp true - + tag Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. From 213779006357c9a907682622a410de4edd688963 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sat, 9 Sep 2023 00:22:31 +0300 Subject: [PATCH 044/119] dspace.cfg: disable automatic browses for some fields --- dspace/config/dspace.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index d5df33557340..b1dc31b27887 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1177,7 +1177,8 @@ webui.browse.index.8 = itemtype:metadata:dcterms.type:text # vocabularies in the submission forms. These could be disabled adding the name of # the vocabularies to exclude in this comma-separated property. # (Requires reboot of servlet container, e.g. Tomcat, to reload) -# webui.browse.vocabularies.disabled = srsc +webui.browse.vocabularies.disabled = srsc, dcterms-subject, dc-contributor-author, cg-contributor-donor, cg-contributor-affiliation + # Enable/Disable tag cloud in browsing. # webui.browse.index.tagcloud. 
= true | false From 5491708894d9fb6277b2e4a54123fee240f44aeb Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 10 Sep 2023 12:24:02 +0300 Subject: [PATCH 045/119] submission-forms.xml: revert some changes to tags The tag input type is currently buggy, and in any case we should only be using it for lists that have tens or more values. --- dspace/config/submission-forms.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index f57be387cb33..efeb70cb2e9e 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -364,7 +364,7 @@ true Indicate the main audience for whom the item is produced. - tag + dropdown @@ -500,7 +500,7 @@ sdg true - tag + dropdown Select any UN Sustainable Development Goals associated with this item. @@ -512,7 +512,7 @@ crp true - tag + dropdown Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. From 68234bd0ef5e4ffa9b233b8a953f18b6e930d7e2 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 10 Sep 2023 21:42:26 +0300 Subject: [PATCH 046/119] submission-forms.xml: don't use tag input type It's too buggy. --- dspace/config/submission-forms.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index efeb70cb2e9e..bf501341d571 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -434,7 +434,7 @@ true - tag + dropdown Select the language of the main content of the item. If the language does not appear in the list, please select 'Other'. If the content does not really have a language (for example, if it is a dataset or an image) please leave this blank. @@ -454,7 +454,7 @@ initiative true - tag + dropdown Select any CGIAR Initiatives(s) associated with this item. 
Use this to show that an Initiative funded this item. @@ -524,7 +524,7 @@ country true - tag + dropdown Select a country or countries within the scope of the item. @@ -571,7 +571,7 @@ true - tag + dropdown Enter AGROVOC subjects in lower case. dcterms-subject @@ -618,7 +618,7 @@ alliancebiovciat true - tag + dropdown @@ -630,7 +630,7 @@ cip true - tag + dropdown @@ -642,7 +642,7 @@ ilri true - tag + dropdown From 520bb03598509bc14156b4b6aa6891da2c9d29d7 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 11 Sep 2023 09:29:48 +0300 Subject: [PATCH 047/119] submission-forms.xml: fix AGROVOC field Fields using external vocabularies must be onebox. --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index bf501341d571..7732e472ee96 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -571,7 +571,7 @@ true - dropdown + onebox Enter AGROVOC subjects in lower case. dcterms-subject From d288fef36630fe20bf1e22b8a871a420b573e77d Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 11 Sep 2023 16:40:52 +0300 Subject: [PATCH 048/119] discovery.xml: re-work facets Re-order and add CGIAR Impact Platform. --- dspace/config/spring/api/discovery.xml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index 94f8994a3101..2099f3fca611 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -166,16 +166,17 @@ - + + - + + + - - @@ -192,6 +193,7 @@ + From 71cfe20719895eacaa6137377805fbad6ff7b9d7 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 12 Sep 2023 20:23:03 +0300 Subject: [PATCH 049/119] dspace/config: remove browse links I'm not using these anymore, as I've worked around them by creating a custom component in Angular that links to the Discovery search. 
--- dspace/config/dspace.cfg | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index b1dc31b27887..c2b57d7c96c0 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1252,10 +1252,6 @@ plugin.named.org.dspace.sort.OrderFormatDelegate= \ # The default below defines the authors to link to other publications by that author webui.browse.link.1 = author:dc.contributor.* webui.browse.link.2 = subject:dcterms.subject -webui.browse.link.3 = region:cg.coverage.region -webui.browse.link.4 = country:cg.coverage.country -webui.browse.link.5 = subregion:cg.coverage.subregion -webui.browse.link.6 = itemtype:dcterms.type #### Display browse frequencies # From 79cb14b9a63cc0a09792d1318ec7435150928b3e Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 18 Sep 2023 13:04:21 +0300 Subject: [PATCH 050/119] dspace.cfg: add Google Analytics key --- dspace/config/dspace.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index c2b57d7c96c0..8f75b32cb564 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1511,7 +1511,7 @@ webui.itemlist.columns = dcterms.accessRights,dcterms.issued(date),dcterms.type, # inside that snipet is your Google Analytics key usually found in this line: # _uacct = "UA-XXXXXXX-X" # Take this key (just the UA-XXXXXX-X part) and place it here in this parameter. 
-# google.analytics.key=UA-XXXXXX-X +google.analytics.key=UA-10691096-8 # The max number of events held in the GA buffer (default: 256) # google.analytics.buffer.limit=256 From 4b26abca0b6438dc45c8ff8fcb5094d958bf9ea8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 21 Sep 2023 10:53:51 +0300 Subject: [PATCH 051/119] submission-forms.xml: update CGIAR Impact Platforms --- dspace/config/submission-forms.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 7732e472ee96..84f123bc721a 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -3102,10 +3102,22 @@ Climate Change Climate Change + + Environmental Health and Biodiversity + Environmental Health and Biodiversity + Gender Gender + + Nutrition, Health and Food Security + Nutrition, Health and Food Security + + + Poverty Reduction, Livelihoods and Jobs + Poverty Reduction, Livelihoods and Jobs + From 14cab6edf1e29ed397e5230a46bffc6cbbfbd5f7 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 25 Sep 2023 15:13:03 +0300 Subject: [PATCH 052/119] dspace/config/modules: disable automatic Discovery reindexing I always run this manually after doing migrations. --- dspace/config/modules/discovery.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/modules/discovery.cfg b/dspace/config/modules/discovery.cfg index 4357980f87ba..7699a34e313a 100644 --- a/dspace/config/modules/discovery.cfg +++ b/dspace/config/modules/discovery.cfg @@ -31,7 +31,7 @@ discovery.index.projection=dc.title,dc.contributor.*,dcterms.issued # property is enabled AND that such a file exists. If the two conditions are # satisfied, a background reindex of all content is triggered in Discovery. # Defaults to true: auto-reindexing is enabled. 
-#discovery.autoReindex = true +discovery.autoReindex = false # Value used for the namedresourcetype facet used by the mydspace # \n|||\n### From 208eef90276aa4e1243d471874383629dcf49770 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 3 Oct 2023 15:11:45 +0300 Subject: [PATCH 053/119] dspace: disable versioning We were not using this in DSpace previously, and DSpace 7 has now changed the default to enabled. --- dspace/config/dspace.cfg | 2 +- dspace/config/modules/versioning.cfg | 2 +- dspace/config/spring/api/identifier-service.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 8f75b32cb564..ce8e2dfeeefa 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -785,7 +785,7 @@ event.dispatcher.default.class = org.dspace.event.BasicDispatcher # Add rdf here, if you are using dspace-rdf to export your repository content as RDF. # Add iiif here, if you are using dspace-iiif. # Add orcidqueue here, if the integration with ORCID is configured and wish to enable the synchronization queue functionality -event.dispatcher.default.consumers = versioning, discovery, eperson, submissionconfig +event.dispatcher.default.consumers = discovery, eperson, submissionconfig # The noindex dispatcher will not create search or browse indexes (useful for batch item imports) event.dispatcher.noindex.class = org.dspace.event.BasicDispatcher diff --git a/dspace/config/modules/versioning.cfg b/dspace/config/modules/versioning.cfg index 1690ceac4cd2..9f8540c00edf 100644 --- a/dspace/config/modules/versioning.cfg +++ b/dspace/config/modules/versioning.cfg @@ -5,7 +5,7 @@ #---------------------------------------------------# # The property versioning.enabled is used to enabled/disable versioning in DSpace, # the default value is true if it unset -# versioning.enabled = true +versioning.enabled = false # Control if the history overview of an item should only be shown to administrators # If enabled 
only the administrators for the item will be able to view the versioning history diff --git a/dspace/config/spring/api/identifier-service.xml b/dspace/config/spring/api/identifier-service.xml index 79e19e879e8e..578ca9c00401 100644 --- a/dspace/config/spring/api/identifier-service.xml +++ b/dspace/config/spring/api/identifier-service.xml @@ -24,7 +24,7 @@ The VersionedHandleIdentifierProvider creates a new versioned handle for every new version. --> - + + Author + aut + + + + + + + + + + Funder + fnd + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + + + + + + + + + + + + + + <xsl:value-of select="doc:metadata/doc:element[@name='cg']/doc:element[@name='journal']/doc:element/doc:field[@name='value']/text()"></xsl:value-of> + + + + + + + + + + + + + + + + + no. + + + + + + + + + + + + + + + + + + - - - - - - + + + + + - - + + + - + - + + + + + + + - - + + + + + - - - - + + + + - - + + + + + + + + - - + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + <xsl:value-of select="." /> + + + + + + + <xsl:value-of select="." /> + - - + + + - + diff --git a/dspace/config/crosswalks/oai/xoai.xml b/dspace/config/crosswalks/oai/xoai.xml index e843814dd7b8..cff70b03a048 100644 --- a/dspace/config/crosswalks/oai/xoai.xml +++ b/dspace/config/crosswalks/oai/xoai.xml @@ -91,6 +91,25 @@ + + + + + + + + + + + + + + + + + This contexts complies with AGRIS Guidelines. 
+ + @@ -149,7 +168,7 @@ mods metadataFormats/mods.xsl http://www.loc.gov/mods/v3 - http://www.loc.gov/standards/mods/v3/mods-3-1.xsd + https://www.loc.gov/standards/mods/v3/mods-3-7.xsd qdc @@ -399,6 +418,30 @@ + + + + + + + + + + + + + + + + + + + + + @@ -516,6 +559,30 @@ + + + org.dspace.xoai.filter.DSpaceAtLeastOneMetadataFilter + + dcterms.type + equal + + Book + Book Chapter + Brief + Conference Paper + Conference Proceedings + Dataset + Journal Article + Manual + Report + Thesis + Training Material + Working Paper + + + + @@ -524,6 +591,11 @@ Open Access DRIVERset + + agris + AGRIS + + openaire OpenAIRE From f0c509903677f58f648deb47e3a08da802d4878c Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 17 Oct 2023 09:17:28 +0300 Subject: [PATCH 057/119] dspace/config/crosswalks: update other crosswalks These crosswalks are used to insert metadata into item pages for search bots like Google to consume. I am not sure whether the XHTML crosswalk is actually being used anymore... --- .../crosswalks/google-metadata.properties | 30 +++++++++---------- .../crosswalks/xhtml-head-item.properties | 25 +++++++++------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/dspace/config/crosswalks/google-metadata.properties b/dspace/config/crosswalks/google-metadata.properties index 157ee9c0b13c..60843c2eae77 100644 --- a/dspace/config/crosswalks/google-metadata.properties +++ b/dspace/config/crosswalks/google-metadata.properties @@ -11,9 +11,9 @@ # e.g. a dissertation item that contains values for the # dissertation-specific metadata elements. 
-google.identifiers.dissertation = dc.type:Thesis -google.identifiers.patent = dc.type:Patent -google.identifiers.technical_report = dc.type:Technical Report +google.identifiers.dissertation = dcterms.type:Thesis +google.identifiers.patent = dcterms.type:Patent +google.identifiers.technical_report = dcterms.type:Technical Report # Field Mappings @@ -37,31 +37,31 @@ google.identifiers.technical_report = dc.type:Technical Report # "$simple-pdf" inserts the full URL to the bitstream when there is only one and it is a PDF google.citation_title = dc.title -google.citation_publisher = dc.publisher +google.citation_publisher = dcterms.publisher google.citation_author = dc.author | dc.contributor.author | dc.creator -google.citation_date = dc.date.issued -google.citation_language = dc.language.iso +google.citation_date = dc.date.copyright | dcterms.issued | dc.date.available | dc.date.accessioned +google.citation_language = dcterms.language google.citation_pmid = google.citation_abstract_html_url = $handle google.citation_fulltext_html_url = google.citation_pdf_url = $simple-pdf -google.citation_keywords = dc.subject, dc.type +google.citation_keywords = dcterms.subject, dcterms.type -google.citation_journal_title = -google.citation_volume = -google.citation_issue = +google.citation_journal_title = cg.journal +google.citation_volume = cg.volume +google.citation_issue = cg.issue google.citation_firstpage = google.citation_lastpage = -google.citation_doi = -google.citation_issn = dc.identifier.issn -google.citation_isbn = dc.identifier.isbn +google.citation_doi = cg.identifier.doi +google.citation_issn = cg.issn +google.citation_isbn = cg.isbn google.citation_conference = # Type-specific fields retrieved when one of the above identifiers # is matched for the item. 
google.citation_dissertation_name = dc.title -google.citation_dissertation_institution = dc.publisher +google.citation_dissertation_institution = dcterms.publisher # Patent country for patent items; needs to be encoded as # a list of ISO 3166-1 alpha-3 codes per @@ -71,7 +71,7 @@ google.citation_patent_country = google.citation_patent_number = google.citation_technical_report_number = -google.citation_technical_report_institution = dc.publisher +google.citation_technical_report_institution = dcterms.publisher #priority "allow list" for citation_pdf_url, shortnames are defined in dspace/config/registries/bitstream-formats.xml #priority order is defined here, where the first type is the most important diff --git a/dspace/config/crosswalks/xhtml-head-item.properties b/dspace/config/crosswalks/xhtml-head-item.properties index f7ba355fd5a5..8206a23636ae 100644 --- a/dspace/config/crosswalks/xhtml-head-item.properties +++ b/dspace/config/crosswalks/xhtml-head-item.properties @@ -7,6 +7,7 @@ schema.DC = http://purl.org/dc/elements/1.1/ schema.DCTERMS = http://purl.org/dc/terms/ +schema.CG = https://agriculturalsemantics.github.io/cg-core/cgcore.html ####### Metadata field mappings ####### @@ -38,23 +39,23 @@ dc.date.accessioned = DCTERMS.dateAccepted,DCTERMS.W3CDTF dc.date.available = DCTERMS.available,DCTERMS.W3CDTF dc.date.copyright = DCTERMS.dateCopyrighted,DCTERMS.W3CDTF dc.date.created = DCTERMS.created,DCTERMS.W3CDTF -dc.date.issued = DCTERMS.issued,DCTERMS.W3CDTF +dcterms.issued = DCTERMS.issued,DCTERMS.W3CDTF dc.identifier = DC.identifier -dc.identifier.citation = DCTERMS.bibliographicCitation +dcterms.bibliographicCitation = DCTERMS.bibliographicCitation dc.identifier.uri = DC.identifier,DCTERMS.URI -dc.description = DC.description -dc.description.abstract = DCTERMS.abstract +dcterms.description = DC.description +dcterms.abstract = DCTERMS.abstract dc.description.tableofcontents = DCTERMS.tableOfContents dc.description.uri = DC.description,DCTERMS.URI dc.format = 
DC.format -dc.format.extent = DCTERMS.extent +dcterms.extent = DCTERMS.extent dc.format.medium = DCTERMS.medium dc.language = DC.language -dc.language.iso = DC.language,DCTERMS.RFC1766 -dc.publisher = DC.publisher -dc.relation = DC.relation +dcterms.language = DC.language,DCTERMS.RFC1766 +dcterms.publisher = DC.publisher +dcterms.relation = DC.relation dc.relation.isformatof = DCTERMS.isFormatOf -dc.relation.ispartof = DCTERMS.isPartOf +dcterms.isPartOf = DCTERMS.isPartOf dc.relation.haspart = DCTERMS.hasPart dc.relation.isversionof = DCTERMS.isVersionOf dc.relation.hasversion = DCTERMS.hasVersion @@ -65,13 +66,15 @@ dc.relation.isreplacedby = DCTERMS.isReplacedBy dc.relation.uri = DC.relation,DCTERMS.URI dc.rights = DC.rights dc.rights.uri = DC.rights,DCTERMS.URI +dcterms.license = DCTERMS.license dc.source = DC.source dc.source.uri = DC.source,DCTERMS.URI -dc.subject = DC.subject +dcterms.subject = DC.subject dc.subject.ddc = DC.subject,DCTERMS.DDC dc.subject.lcc = DC.subject,DCTERMS.LCC dc.subject.lcsh = DC.subject,DCTERMS.LCSH dc.subject.mesh = DC.subject,DCTERMS.MESH dc.title = DC.title dc.title.alternative = DCTERMS.alternative -dc.type = DC.type +dcterms.type = DCTERMS.type +dcterms.accessRights = DCTERMS.accessRights From 486a8bde8f0d67a1c34bc3781052eb34bc0436ee Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 30 Oct 2023 10:08:04 +0300 Subject: [PATCH 058/119] Revert "dspace/config: disable ldap groupmap" This reverts commit 8ed657958dbab884daea84d537553ad17ef5e620. The bug was fixed. 
See: https://github.com/DSpace/DSpace/pull/9152 --- dspace/config/modules/authentication-ldap.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/modules/authentication-ldap.cfg b/dspace/config/modules/authentication-ldap.cfg index 50ae87f98809..b2916b88e396 100644 --- a/dspace/config/modules/authentication-ldap.cfg +++ b/dspace/config/modules/authentication-ldap.cfg @@ -149,7 +149,7 @@ authentication-ldap.search_scope = 2 # in user's full DN. If it's found, assign user to the DSpace group # specified by the right part of the groupmap value (after the ":"). # One user may belong to multiple groups. -#authentication-ldap.login.groupmap.1 = OU=ILRIHUB:ILRI_LDAP_USERS +authentication-ldap.login.groupmap.1 = OU=ILRIHUB:ILRI_LDAP_USERS #authentication-ldap.login.groupmap.3 = ou=ldap-dept3:dspace-groupA # If this property is uncommented, it changes the meaning of the left part of From 0f561b53f65ec3a3a7c951b97dcecff947549293 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 2 Nov 2023 15:16:22 +0300 Subject: [PATCH 059/119] dspace/config/log4j2.xml: reduce noisy logs mwood on Slack suggested this for reducing the unhelpful and noisy logs like: 2023-11-01 00:00:07,581 INFO a2151e44-c888-4fbc-a54e-b71edeb5cc45 1b74f2ee-d4cc-473b-962c-65e6f43c9544 org.dspace.app.rest.utils.DSpaceAPIRequestLoggingFilter @ Before request [GET /server/api/core/items/f5e081f4-1c45-4dbf-8cac-2126119f87e5] originated from / --- dspace/config/log4j2.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dspace/config/log4j2.xml b/dspace/config/log4j2.xml index 6e9a43e4f0fe..3e73a134e93d 100644 --- a/dspace/config/log4j2.xml +++ b/dspace/config/log4j2.xml @@ -97,6 +97,10 @@ + + + From 6a0b609e1f773f5100d5d99c250ed27d27fa7914 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 6 Nov 2023 10:18:43 +0300 Subject: [PATCH 060/119] discovery.xml: minor adjustment to date sorting Make sure that we have both ascending and descending whenever there is a date available 
for sorting. Also, change the default sort order for sortDateIssued to asc as there is already sortDateIssuedDesc. See: https://github.com/DSpace/DSpace/issues/9104 --- dspace/config/spring/api/discovery.xml | 37 +++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index ddd4b2244ddb..b165c461bbba 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -214,7 +214,9 @@ + + @@ -362,7 +364,9 @@ + + @@ -501,7 +505,9 @@ + + @@ -643,7 +649,9 @@ + + @@ -771,6 +779,7 @@ + @@ -846,6 +855,7 @@ + @@ -922,6 +932,7 @@ + @@ -998,6 +1009,7 @@ + @@ -1073,6 +1085,7 @@ + @@ -1157,7 +1170,9 @@ + + @@ -1227,7 +1242,9 @@ + + @@ -1291,6 +1308,7 @@ + @@ -1353,6 +1371,7 @@ + @@ -1526,6 +1545,7 @@ + @@ -1590,6 +1610,7 @@ + @@ -1651,6 +1672,7 @@ + @@ -1711,6 +1733,7 @@ + @@ -1771,6 +1794,7 @@ + @@ -1830,6 +1854,7 @@ + @@ -1890,6 +1915,7 @@ + @@ -1949,6 +1975,7 @@ + @@ -2021,6 +2048,7 @@ + @@ -2080,6 +2108,7 @@ + @@ -3052,9 +3081,15 @@ - + + + + + + + From 2c75ee7da9c3e2bb4473c818a02ad613abd94b0d Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 23 Nov 2023 12:21:20 +0300 Subject: [PATCH 061/119] dspace/config: add more bot overrides --- dspace/config/spiders/agents/ilri | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dspace/config/spiders/agents/ilri b/dspace/config/spiders/agents/ilri index 4f1da09aa0ef..70d23c840dda 100644 --- a/dspace/config/spiders/agents/ilri +++ b/dspace/config/spiders/agents/ilri @@ -40,3 +40,9 @@ Scoop\.it WebAPIClient RStudio ^MEL +GuzzleHttp +Owler +newspaperjs +^Chrome$ +curl +^mozilla From 43f9017f5f3db99471bc619999bd7bc82dcb0df6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 30 Nov 2023 14:30:36 +0300 Subject: [PATCH 062/119] dspace/config: update OAI mods crosswalk FAO highlighted that the markup for alternative titles is slightly different. 
See: https://www.loc.gov/standards/mods/userguide/titleinfo.html --- dspace/config/crosswalks/oai/metadataFormats/mods.xsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/crosswalks/oai/metadataFormats/mods.xsl b/dspace/config/crosswalks/oai/metadataFormats/mods.xsl index ad169b099c75..9f781e39ef05 100644 --- a/dspace/config/crosswalks/oai/metadataFormats/mods.xsl +++ b/dspace/config/crosswalks/oai/metadataFormats/mods.xsl @@ -206,8 +206,8 @@ - - <xsl:value-of select="." /> + + <xsl:value-of select="." /> From 8cd0496e6e376defa5378347ed09ab4c96fe4e89 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 1 Dec 2023 10:49:22 +0300 Subject: [PATCH 063/119] dspace/config: update ORCID in submission form Clarify that these are not just for CGIAR authors, but recommended. --- dspace/config/submission-forms.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 84f123bc721a..f819cd150797 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -109,10 +109,10 @@ creator identifier true - + cg-creator-identifier onebox - Enter ORCID identifiers for CGIAR authors, one per author. If an identifier is missing from the list, enter a new one in the exact same format (Name: 0000-0002-1735-7458). Use the exact name style the author uses at https://orcid.org + Enter ORCID identifiers (for CGIAR authors at least). Click below to see a pre-populated list of author ORCID identifiers that you can select from. Enter one per author. If an identifier is missing, enter a new one in the same format (Name: 0000-0002-1735-7458). 
Use the exact name style the author uses at https://orcid.org From 331af88f1997f81e593f90de5eaad05980eb9779 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 5 Dec 2023 19:52:25 +0300 Subject: [PATCH 064/119] dspace/config: make DOI non-repeatable in submission form It is not repeatable in our current production DSpace 6 instance so I don't know why it is repeatable here. --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index f819cd150797..352636d09aad 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -408,7 +408,7 @@ cg identifier doi - true + false onebox From 0fc31ce79efedceb5766ded2fe79cbe921147235 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 5 Dec 2023 19:54:02 +0300 Subject: [PATCH 065/119] dspace/config: make series name/number not repeatable This does not need to be repeatable, and indeed is not in our prod- uction DSpace 6 instance. --- dspace/config/submission-forms.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 352636d09aad..3e8e21c586b1 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -269,7 +269,7 @@ dcterms isPartOf - true + false onebox @@ -280,7 +280,7 @@ cg number - true + false onebox From 8cc75811ea0c206654955b0cf4c0f67271479095 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 12 Dec 2023 21:56:00 +0300 Subject: [PATCH 066/119] dspace/config/spiders: remove ^mozilla from spiders Because DSpace applies these case-insensitively, this means that we excluded real hits from Mozilla user agents for the past two weeks. 
--- dspace/config/spiders/agents/ilri | 1 - 1 file changed, 1 deletion(-) diff --git a/dspace/config/spiders/agents/ilri b/dspace/config/spiders/agents/ilri index 70d23c840dda..b89f6281bf97 100644 --- a/dspace/config/spiders/agents/ilri +++ b/dspace/config/spiders/agents/ilri @@ -45,4 +45,3 @@ Owler newspaperjs ^Chrome$ curl -^mozilla From a54634b7a841244220970d38ecea3d7bf88a7f77 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 27 Dec 2023 10:56:10 +0300 Subject: [PATCH 067/119] dspace/config: make related reference repeatable Requested by IFPRI. --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 3e8e21c586b1..3d5bc0b41aa1 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -699,7 +699,7 @@ dcterms relation - false + true onebox Enter related reference link (normally a URL to another item). From 26f0aee2a4a1d09824fdbd94c22cabb9a3f97735 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 2 Jan 2024 20:48:27 +0300 Subject: [PATCH 068/119] Add country code tagger curation task From the cgspace-java-helpers. 
--- dspace/config/dspace.cfg | 4 ++++ dspace/config/modules/countrycodetagger.cfg | 8 ++++++++ dspace/config/modules/countrycodetagger.force.cfg | 8 ++++++++ dspace/config/modules/curate.cfg | 2 ++ dspace/modules/additions/pom.xml | 5 +++++ 5 files changed, 27 insertions(+) create mode 100644 dspace/config/modules/countrycodetagger.cfg create mode 100644 dspace/config/modules/countrycodetagger.force.cfg diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 41ae91bf277f..8a311054ad9d 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1677,3 +1677,7 @@ include = ${module_dir}/usage-statistics.cfg include = ${module_dir}/versioning.cfg include = ${module_dir}/workflow.cfg include = ${module_dir}/external-providers.cfg + +# Configuration for CGSpace curation tasks +include = ${module_dir}/countrycodetagger.cfg +include = ${module_dir}/countrycodetagger.force.cfg diff --git a/dspace/config/modules/countrycodetagger.cfg b/dspace/config/modules/countrycodetagger.cfg new file mode 100644 index 000000000000..e074715fa731 --- /dev/null +++ b/dspace/config/modules/countrycodetagger.cfg @@ -0,0 +1,8 @@ +# name of the field containing ISO 3166-1 country names +countrycodetagger.iso3166.field = cg.coverage.country + +# name of the field containing ISO 3166-1 Alpha2 country codes +countrycodetagger.iso3166-alpha2.field = cg.coverage.iso3166-alpha2 + +# only add country codes if an item doesn't have any (default false) +#countrycodetagger.forceupdate = false diff --git a/dspace/config/modules/countrycodetagger.force.cfg b/dspace/config/modules/countrycodetagger.force.cfg new file mode 100644 index 000000000000..5425ecb94cbf --- /dev/null +++ b/dspace/config/modules/countrycodetagger.force.cfg @@ -0,0 +1,8 @@ +# name of the field containing ISO 3166-1 country names +countrycodetagger.force.iso3166.field = cg.coverage.country + +# name of the field containing ISO 3166-1 Alpha2 country codes +countrycodetagger.force.iso3166-alpha2.field = 
cg.coverage.iso3166-alpha2 + +# clear existing country codes and add new ones +countrycodetagger.force.forceupdate = true diff --git a/dspace/config/modules/curate.cfg b/dspace/config/modules/curate.cfg index 1d7b87960df1..df6f1d17c572 100644 --- a/dspace/config/modules/curate.cfg +++ b/dspace/config/modules/curate.cfg @@ -17,6 +17,8 @@ plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.MetadataV plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RegisterDOI = registerdoi #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.CitationPage = citationpage # add new tasks here (or in additional config files) +plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger +plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger.force ## task queue implementation plugin.single.org.dspace.curate.TaskQueue = org.dspace.curate.FileTaskQueue diff --git a/dspace/modules/additions/pom.xml b/dspace/modules/additions/pom.xml index 7e60e982ec45..7dc205018c41 100644 --- a/dspace/modules/additions/pom.xml +++ b/dspace/modules/additions/pom.xml @@ -286,6 +286,11 @@ mockito-inline test + + io.github.ilri.cgspace + cgspace-java-helpers + 7.6.1-SNAPSHOT + From 626aefa400e6619790be96b3a6951ba9819fc3ee Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 8 Jan 2024 12:30:20 +0300 Subject: [PATCH 069/119] dspace/config: add CGIAR Trust Fund to sponsors --- .../cg-contributor-donor.xml | 1337 +++++++++-------- 1 file changed, 669 insertions(+), 668 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml index 05ee10325302..bde44dc2b2bf 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml From f311b4609d3060130d8eb6de0fcafb0202a31262 Mon Sep
17 00:00:00 2001 From: Alan Orth Date: Mon, 8 Jan 2024 15:34:58 +0300 Subject: [PATCH 070/119] Add ilri scripts Copied from the 6_x-prod branch. --- ilri/add_dc_rights.py | 243 ++++++++++++++++ ilri/add_orcid_identifiers_csv.py | 247 ++++++++++++++++ ilri/agrovoc_lookup.py | 253 ++++++++++++++++ ilri/bing-networks-to-ips.sh | 15 + ilri/check-spider-hits.sh | 237 +++++++++++++++ ilri/check-spider-ip-hits.sh | 170 +++++++++++ ilri/check_duplicates.py | 319 ++++++++++++++++++++ ilri/check_duplicates_fuzzy.py | 321 ++++++++++++++++++++ ilri/countries_to_csv.py | 60 ++++ ilri/create-value-pairs.sh | 15 + ilri/crossref_doi_lookup.py | 440 ++++++++++++++++++++++++++++ ilri/crossref_funders_lookup.py | 190 ++++++++++++ ilri/crossref_issn_lookup.py | 153 ++++++++++ ilri/delete_metadata_values.py | 152 ++++++++++ ilri/doi_to_handle.py | 165 +++++++++++ ilri/fix_initiative_mappings.py | 323 ++++++++++++++++++++ ilri/fix_maxmind_stats.py | 64 ++++ ilri/fix_metadata_values.py | 199 +++++++++++++ ilri/generate_solr_statistics.py | 167 +++++++++++ ilri/generate_thumbnails.py | 178 ++++++++++++ ilri/get_pdfs_dspace.py | 146 ++++++++++ ilri/get_pdfs_scihub.py | 117 ++++++++ ilri/get_pdfs_unpaywall.py | 163 +++++++++++ ilri/iso3166_lookup.py | 174 +++++++++++ ilri/iso_639_value_pairs.py | 25 ++ ilri/migrate-fields.sh | 113 +++++++ ilri/move-collections.sh | 79 +++++ ilri/move_metadata_values.py | 158 ++++++++++ ilri/orcid_authority_to_item.py | 317 ++++++++++++++++++++ ilri/parse_iso_codes.py | 98 +++++++ ilri/post_bitstreams.py | 469 ++++++++++++++++++++++++++++++ ilri/post_ciat_pdfs.py | 360 +++++++++++++++++++++++ ilri/resolve_addresses.py | 198 +++++++++++++ ilri/resolve_addresses_geoip2.py | 246 ++++++++++++++++ ilri/resolve_orcids.py | 311 ++++++++++++++++++++ ilri/rest_find_collections.py | 171 +++++++++++ ilri/ror_lookup.py | 193 ++++++++++++ ilri/sherpa_issn_lookup.py | 149 ++++++++++ ilri/subdivision_lookup.py | 127 ++++++++ ilri/update_orcids.py | 171 +++++++++++ 
ilri/util.py | 159 ++++++++++ 41 files changed, 7855 insertions(+) create mode 100755 ilri/add_dc_rights.py create mode 100755 ilri/add_orcid_identifiers_csv.py create mode 100755 ilri/agrovoc_lookup.py create mode 100755 ilri/bing-networks-to-ips.sh create mode 100755 ilri/check-spider-hits.sh create mode 100755 ilri/check-spider-ip-hits.sh create mode 100755 ilri/check_duplicates.py create mode 100755 ilri/check_duplicates_fuzzy.py create mode 100755 ilri/countries_to_csv.py create mode 100755 ilri/create-value-pairs.sh create mode 100755 ilri/crossref_doi_lookup.py create mode 100755 ilri/crossref_funders_lookup.py create mode 100755 ilri/crossref_issn_lookup.py create mode 100755 ilri/delete_metadata_values.py create mode 100755 ilri/doi_to_handle.py create mode 100755 ilri/fix_initiative_mappings.py create mode 100755 ilri/fix_maxmind_stats.py create mode 100755 ilri/fix_metadata_values.py create mode 100755 ilri/generate_solr_statistics.py create mode 100755 ilri/generate_thumbnails.py create mode 100755 ilri/get_pdfs_dspace.py create mode 100755 ilri/get_pdfs_scihub.py create mode 100755 ilri/get_pdfs_unpaywall.py create mode 100755 ilri/iso3166_lookup.py create mode 100755 ilri/iso_639_value_pairs.py create mode 100755 ilri/migrate-fields.sh create mode 100755 ilri/move-collections.sh create mode 100755 ilri/move_metadata_values.py create mode 100755 ilri/orcid_authority_to_item.py create mode 100755 ilri/parse_iso_codes.py create mode 100755 ilri/post_bitstreams.py create mode 100755 ilri/post_ciat_pdfs.py create mode 100755 ilri/resolve_addresses.py create mode 100755 ilri/resolve_addresses_geoip2.py create mode 100755 ilri/resolve_orcids.py create mode 100755 ilri/rest_find_collections.py create mode 100755 ilri/ror_lookup.py create mode 100755 ilri/sherpa_issn_lookup.py create mode 100755 ilri/subdivision_lookup.py create mode 100755 ilri/update_orcids.py create mode 100644 ilri/util.py diff --git a/ilri/add_dc_rights.py b/ilri/add_dc_rights.py new file 
mode 100755 index 000000000000..7ba1cd0f8667 --- /dev/null +++ b/ilri/add_dc_rights.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# +# add-dc-rights.py 1.1.2 +# +# Copyright Alan Orth. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see https://www.gnu.org/licenses/. +# +# --- +# +# Add usage rights (dc.rights) to items from CSV. +# +# This script searches for items by handle and adds a dc.rights field to each +# (assuming one does not exist). The format of the CSV file should be: +# +# dc.rights,handle +# CC-BY-NC-ND,10568/72643 +# CC-BY-NC-ND,10568/72644 +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (for example, in a virtual environment): +# +# $ pip install colorama psycopg2-binary +# + +import argparse +import csv +import signal +import sys + +import psycopg2 +from colorama import Fore + + +def main(): + # parse the command line arguments + parser = argparse.ArgumentParser(description="Add usage rights to items from CSV.") + parser.add_argument( + "-i", + "--csv-file", + help="CSV file containing item handles and rights.", + required=True, + type=argparse.FileType("r", encoding="UTF-8"), + ) + parser.add_argument("-db", "--database-name", help="Database name", required=True) + parser.add_argument( + "-u", "--database-user", help="Database username", required=True + ) + parser.add_argument( + "-p", "--database-pass", help="Database password", required=True + ) +
parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", + ) + parser.add_argument( + "-n", + "--dry-run", + help="Only print changes that would be made.", + action="store_true", + ) + parser.add_argument( + "-hf", + "--handle-field-name", + help='Name of column with handles in "10568/4" format (no URL).', + default="handle", + ) + parser.add_argument( + "-rf", + "--rights-field-name", + help="Name of column with usage rights.", + default="dc.rights", + ) + args = parser.parse_args() + + # set the signal handler for SIGINT (^C) so we can exit cleanly + signal.signal(signal.SIGINT, signal_handler) + + # connect to database + try: + conn_string = "dbname={database_name} user={database_user} password={database_pass} host=localhost".format( + database_name=args.database_name, + database_user=args.database_user, + database_pass=args.database_pass, + ) + conn = psycopg2.connect(conn_string) + + if args.debug: + sys.stderr.write(Fore.GREEN + "Connected to the database.\n" + Fore.RESET) + except psycopg2.OperationalError: + sys.stderr.write(Fore.RED + "Unable to connect to the database.\n" + Fore.RESET) + + # close the input CSV file before we exit + args.csv_file.close() + + exit(1) + + # open the CSV + reader = csv.DictReader(args.csv_file) + + # iterate over rows in the CSV + for row in reader: + handle = row[args.handle_field_name] + rights = row[args.rights_field_name] + + if args.debug: + sys.stderr.write( + Fore.GREEN + + "Finding item with handle {handle}\n".format(handle=handle) + + Fore.RESET + ) + + with conn: + # cursor will be closed after this block exits + # see: http://initd.org/psycopg/docs/usage.html#with-statement + with conn.cursor() as cursor: + # get resource_id for current handle + sql = "SELECT resource_id FROM handle WHERE handle=%s" + # remember that tuples with one item need a comma after them!
+ cursor.execute(sql, (handle,)) + + # no resource_id with this handle exists + if cursor.rowcount == 0: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Did not find item with handle {handle}, skipping.\n".format( + handle=handle + ) + + Fore.RESET + ) + + continue + + # multiple resource_id with this handle exist (I don't think this will happen, but better to check) + elif cursor.rowcount > 1: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Found multiple items with handle {handle}, skipping.\n".format( + handle=handle + ) + + Fore.RESET + ) + + continue + + result = cursor.fetchone() + # result will be an array like: [74525] + resource_id = result[0] + + # in our test environment I've seen resource_id be NULL for some reason + if resource_id is None: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Item with handle {handle} does not have a resource_id, skipping.\n".format( + handle=handle + ) + + Fore.RESET + ) + + continue + + # Check if this item already has dc.rights metadata + # resource_type_id 2 is for item metadata, metadata_field_id 53 is dc.rights + sql = "SELECT text_value FROM metadatavalue WHERE resource_type_id=2 AND resource_id=%s AND metadata_field_id=53" + # remember that tuples with one item need a comma after them!
+ cursor.execute(sql, (resource_id,)) + + # if rowcount is greater than 0 there must be existing rights for this item + if cursor.rowcount > 0: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Found existing rights metadata for item with handle {handle}, skipping.\n".format( + handle=handle + ) + + Fore.RESET + ) + continue + + # no existing rights metadata, so add one (NOTE(review): `result` assigned on the next line is never read afterwards) + result = cursor.fetchone() + + if args.dry_run: + print( + Fore.GREEN + + 'Would add rights "{rights}" to item with handle {handle}.\n'.format( + rights=rights, handle=handle + ) + + Fore.RESET + ) + continue + + if args.debug: + sys.stderr.write( + Fore.GREEN + + 'Adding rights "{rights}" to item with handle {handle}.\n'.format( + rights=rights, handle=handle + ) + + Fore.RESET + ) + + # metadatavalue IDs come from a PostgreSQL sequence that increments when you call it + cursor.execute("SELECT nextval('metadatavalue_seq')") + metadata_value_id = cursor.fetchone()[0] + + # resource_type_id 2 is for item metadata, metadata_field_id 53 is dc.rights + sql = "INSERT INTO metadatavalue (metadata_value_id, resource_id, metadata_field_id, text_value, place, confidence, resource_type_id) VALUES (%s, %s, %s, %s, %s, %s, %s)" + cursor.execute( + sql, (metadata_value_id, resource_id, 53, rights, 1, -1, 2) + ) + + if args.debug: + sys.stderr.write(Fore.GREEN + "Disconnecting from database.\n" + Fore.RESET) + + # close the database connection before leaving + conn.close() + + # close the input CSV file before we exit + args.csv_file.close() + + +def signal_handler(signal, frame): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/ilri/add_orcid_identifiers_csv.py b/ilri/add_orcid_identifiers_csv.py new file mode 100755 index 000000000000..b4973b3a05f7 --- /dev/null +++ b/ilri/add_orcid_identifiers_csv.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +# +# add-orcid-identifiers-csv.py v1.1.6 +# +# Copyright Alan Orth.
+ +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Add ORCID identifiers to items for a given author name from CSV. +# +# We had previously migrated the ORCID identifiers from CGSpace's authority Solr +# core to cg.creator.identifier fields in matching items, but now we want to add +# them to other matching items in a more arbitrary fashion. Items that are ol- +# der or were uploaded in batch did not have matching authors in the authority +# core, so they did not benefit from that migration, for example. +# +# This script searches for items by author name and adds a cg.creator.identifier +# field to each (assuming one does not exist). The format of the CSV file should +# be: +# +# dc.contributor.author,cg.creator.identifier +# "Orth, Alan",Alan S. Orth: 0000-0002-1735-7458 +# "Orth, A.",Alan S. Orth: 0000-0002-1735-7458 +# +# The order of authors in dc.contributor.author is respected and mirrored in the +# new cg.creator.identifier fields. +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install colorama +# + +import argparse +import csv +import logging +import re +import signal +import sys + +import util +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) + + +def main(): + # parse the command line arguments + parser = argparse.ArgumentParser( + description="Add ORCID identifiers to items for a given author name from CSV. Respects the author order from the dc.contributor.author field."
+ ) + parser.add_argument( + "--author-field-name", + "-f", + help="Name of column with author names.", + default="dc.contributor.author", + ) + parser.add_argument( + "--csv-file", + "-i", + help="CSV file containing author names and ORCID identifiers.", + required=True, + type=argparse.FileType("r", encoding="UTF-8"), + ) + parser.add_argument("--database-name", "-db", help="Database name", required=True) + parser.add_argument( + "--database-user", "-u", help="Database username", required=True + ) + parser.add_argument( + "--database-pass", "-p", help="Database password", required=True + ) + parser.add_argument( + "--debug", + "-d", + help="Print debug messages to standard error (stderr).", + action="store_true", + ) + parser.add_argument( + "--dry-run", + "-n", + help="Only print changes that would be made.", + action="store_true", + ) + parser.add_argument( + "--orcid-field-name", + "-o", + help='Name of column with creators in "Name: 0000-0000-0000-0000" format.', + default="cg.creator.identifier", + ) + args = parser.parse_args() + + # The default log level is WARNING, but we want to set it to DEBUG or INFO + if args.debug: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + # Set the global log format + logging.basicConfig(format="[%(levelname)s] %(message)s") + + # set the signal handler for SIGINT (^C) so we can exit cleanly + signal.signal(signal.SIGINT, signal_handler) + + # connect to database + conn = util.db_connect( + args.database_name, args.database_user, args.database_pass, "localhost" + ) + + cursor = conn.cursor() + + # open the CSV + reader = csv.DictReader(args.csv_file) + + # iterate over rows in the CSV + for row in reader: + author_name = row[args.author_field_name] + + logger.debug( + Fore.GREEN + f"Finding items with author name: {author_name}" + Fore.RESET + ) + + # find all item metadata records with this author name + # metadata_field_id 3 is author + sql = "SELECT dspace_object_id, place FROM metadatavalue
WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=3 AND text_value=%s" + # remember that tuples with one item need a comma after them! + cursor.execute(sql, (author_name,)) + records_with_author_name = cursor.fetchall() + + if len(records_with_author_name) > 0: + logger.debug( + Fore.GREEN + + f"> Found {len(records_with_author_name)} items." + + Fore.RESET + ) + + # extract cg.creator.identifier text to add from CSV and strip leading/trailing whitespace + text_value = row[args.orcid_field_name].strip() + # extract the ORCID identifier from the cg.creator.identifier text field in the CSV + orcid_identifier_pattern = re.compile( + r"[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}" + ) + orcid_identifier_match = orcid_identifier_pattern.search(text_value) + + # sanity check to make sure we extracted the ORCID identifier from the cg.creator.identifier text in the CSV + if orcid_identifier_match is None: + logger.debug( + Fore.YELLOW + + f'Skipping invalid ORCID identifier in "{text_value}".'
+ + Fore.RESET + ) + continue + + # we only expect one ORCID identifier, so if it matches it will be group "0" + # see: https://docs.python.org/3/library/re.html + orcid_identifier = orcid_identifier_match.group(0) + + # iterate over results for current author name to add cg.creator.identifier metadata + for record in records_with_author_name: + dspace_object_id = record[0] + # "place" is the order of a metadata value so we can add the cg.creator.identifier metadata matching the author order + place = record[1] + confidence = -1 + + # get the metadata_field_id for the cg.creator.identifier field + metadata_field_id = util.field_name_to_field_id( + cursor, "cg.creator.identifier" + ) + + # check if there is an existing cg.creator.identifier with this author's ORCID identifier for this item (without restricting the "place") + # note that the SQL here is quoted differently to allow us to use LIKE with % wildcards with our parameter substitution + sql = "SELECT * from metadatavalue WHERE dspace_object_id=%s AND metadata_field_id=%s AND text_value LIKE '%%' || %s || '%%' AND confidence=%s AND dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn)" + + cursor.execute( + sql, + ( + dspace_object_id, + metadata_field_id, + orcid_identifier, + confidence, + ), + ) + records_with_orcid_identifier = cursor.fetchall() + + if len(records_with_orcid_identifier) == 0: + if args.dry_run: + logger.info( + Fore.YELLOW + + f'(DRY RUN) Adding ORCID identifier "{text_value}" to item {dspace_object_id}' + + Fore.RESET + ) + + continue + + logger.info( + Fore.YELLOW + + f'Adding ORCID identifier "{text_value}" to item {dspace_object_id}' + + Fore.RESET + ) + + # metadatavalue IDs come from a PostgreSQL sequence that increments when you call it + cursor.execute("SELECT nextval('metadatavalue_seq')") + metadata_value_id = cursor.fetchone()[0] + + sql = "INSERT INTO metadatavalue (metadata_value_id, dspace_object_id, metadata_field_id, text_value, place, confidence)
VALUES (%s, %s, %s, %s, %s, %s)" + cursor.execute( + sql, + ( + metadata_value_id, + dspace_object_id, + metadata_field_id, + text_value, + place, + confidence, + ), + ) + + # Update the last_modified date for each item + util.update_item_last_modified(cursor, dspace_object_id) + else: + logger.debug( + Fore.GREEN + + f"Item {dspace_object_id} already has an ORCID identifier for {text_value}." + + Fore.RESET + ) + + logger.debug("Disconnecting from database.") + + # commit the changes + if not args.dry_run: + conn.commit() + + # close the database connection before leaving + conn.close() + + # close the input CSV file before we exit + args.csv_file.close() + + +def signal_handler(signal, frame): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/ilri/agrovoc_lookup.py b/ilri/agrovoc_lookup.py new file mode 100755 index 000000000000..ea267d6d6e9c --- /dev/null +++ b/ilri/agrovoc_lookup.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +# +# agrovoc-lookup.py 0.4.2 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public AGROVOC REST API for subjects read from a text file. Text +# file should have one subject per line. Results are saved to a CSV including +# the subject, the language, the match type, and the total number of matches.
+# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import csv +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +from colorama import Fore + + +# read subjects from a text file, one per line +def read_subjects_from_file(): + # initialize an empty list for subjects + subjects = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add subjects that aren't already present + if line not in subjects: + subjects.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_subjects(subjects) + + +def resolve_subjects(subjects): + fieldnames = ["subject", "language", "match type", "number of matches"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + # enable transparent request cache with thirty days expiry, as AGROVOC only + # makes new releases monthly so this should be safe. 
+ expire_after = timedelta(days=30) + requests_cache.install_cache("requests-cache", expire_after=expire_after) + + # prune old cache entries + requests_cache.delete() + + for subject in subjects: + if args.debug: + sys.stderr.write( + Fore.GREEN + + f"Looking up the subject: {subject} ({'any' or args.language})\n" + + Fore.RESET + ) + + request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" + request_params = {"query": subject} + + if args.language: + # use user specified language + request_params.update(lang=args.language) + + request = requests.get(request_url, params=request_params) + + if request.status_code == requests.codes.ok: + data = request.json() + + # Assume no match + matched = False + + number_of_matches = len(data["results"]) + + # no results means no match + if number_of_matches == 0: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + f"No match for {subject!r} in AGROVOC (cached: {request.from_cache})\n" + + Fore.RESET + ) + + writer.writerow( + { + "subject": subject, + "language": "", + "match type": "", + "number of matches": number_of_matches, + } + ) + elif number_of_matches >= 1: + for result in request.json()["results"]: + # if there is more than one result we need to check each for + # a preferred or matchedPreLabel match first. If there are + # none then we can check each result again for an altLabel + # matches.alternate match. Note that we need to make sure + # they actually exist before attempting to reference them. + # If they don't exist then we'll catch the exception and set + # the values to None. + # + # Note that matchedPrefLabel is not a property in the SKOS/ + # SKOSXL vocabulary. It seems to be a hint returned by the + # SKOSMOS server to indicate that the search term matched + # the prefLabel of some language. 
+ try: + result["prefLabel"] + except KeyError: + result["prefLabel"] = None + + try: + result["matchedPrefLabel"] + except KeyError: + result["matchedPrefLabel"] = None + + # upper case our subject and the AGROVOC result to make sure + # we're comparing the same thing because AGROVOC returns the + # title case like "Iran" no matter whether you search for + # "IRAN" or "iran". + if ( + result["prefLabel"] + and subject.upper() == result["prefLabel"].upper() + ): + matched = True + language = result["lang"] + print( + f"Match for {subject!r} in AGROVOC {language} (cached: {request.from_cache})" + ) + + writer.writerow( + { + "subject": subject, + "language": language, + "match type": "prefLabel", + "number of matches": number_of_matches, + } + ) + + break + elif ( + result["matchedPrefLabel"] + and subject.upper() == result["matchedPrefLabel"].upper() + ): + matched = True + language = result["lang"] + print( + f"Match for {subject!r} in AGROVOC {language} (cached: {request.from_cache})" + ) + + writer.writerow( + { + "subject": subject, + "language": language, + "match type": "prefLabel", + "number of matches": number_of_matches, + } + ) + + break + + # If we're here we assume there were no matches for prefLabel or + # matchedPrefLabel in the results, so now we will check for an + # altLabel match. 
+ if not matched: + for result in request.json()["results"]: + # make sure key exists before trying to access it + try: + result["altLabel"] + except KeyError: + result["altLabel"] = None + + if ( + result["altLabel"] + and subject.upper() == result["altLabel"].upper() + ): + matched = True + language = result["lang"] + print( + f"Match for {subject!r} in AGROVOC {language} (cached: {request.from_cache})" + ) + + writer.writerow( + { + "subject": subject, + "language": language, + "match type": "altLabel", + "number of matches": number_of_matches, + } + ) + + break + + # close output files before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output files before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the AGROVOC REST API to validate subject terms from a text file and save results in a CSV." +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing subject terms to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-l", "--language", help="Language to query terms (example en, default any)." +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write results to (CSV).", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the addresses from there +if args.input_file: + read_subjects_from_file() + +exit() diff --git a/ilri/bing-networks-to-ips.sh b/ilri/bing-networks-to-ips.sh new file mode 100755 index 000000000000..e98ec716939e --- /dev/null +++ b/ilri/bing-networks-to-ips.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# +# Latest as of 2022-07-06. 
For printing the IPs in each CIDR network so that I
+# can purge them all from Solr statistics using check-spider-ip-hits.sh.
+
+BINGBOT_JSON_URL=https://www.bing.com/toolbox/bingbot.json
+# Extract the networks from the JSON (I wrote this using https://jqplay.org/)
+BINGBOT_NETWORKS=$(http "$BINGBOT_JSON_URL" \
+    | jq --raw-output '.["prefixes"][].ipv4Prefix')
+
+for network in $BINGBOT_NETWORKS; do
+    # Use prips to print IPs in given CIDR and strip network and broadcast.
+    # See: https://stackoverflow.com/a/52501093/1996540
+    prips "$network" | sed -e '1d; $d'
+done
diff --git a/ilri/check-spider-hits.sh b/ilri/check-spider-hits.sh
new file mode 100755
index 000000000000..9d6cc2bab2f9
--- /dev/null
+++ b/ilri/check-spider-hits.sh
@@ -0,0 +1,237 @@
+#!/usr/bin/env bash
+#
+# check-spider-hits.sh v1.2.0
+#
+# Copyright (C) 2019-2020 Alan Orth
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Exit on first error
+set -o errexit
+
+# defaults
+readonly DEF_SPIDERS_PATTERN_FILE=/dspace/config/spiders/agents/example
+readonly DEF_SOLR_URL=http://localhost:8081/solr
+readonly DEF_STATISTICS_SHARD=statistics
+
+######
+
+readonly PROGNAME=$(basename "$0")
+readonly ARGS="$@"
+
+function usage() {
+    cat <<-EOF
+Usage: $PROGNAME [-d] [-f $DEF_SPIDERS_PATTERN_FILE] [-p] [-s $DEF_STATISTICS_SHARD] [-u $DEF_SOLR_URL]
+
+Optional arguments:
+    -d: print debug messages
+    -f: path to file containing spider user agent patterns¹ (default: $DEF_SPIDERS_PATTERN_FILE)
+    -p: purge statistics that match spider user agents
+    -s: Solr statistics shard, for example statistics or statistics-2018² (default: $DEF_STATISTICS_SHARD)
+    -u: URL to Solr (default: $DEF_SOLR_URL)
+
+Written by: Alan Orth
+
+¹ DSpace ships an "example" pattern file that works well. Another option is the patterns file maintained by the COUNTER-Robots project.
+² If your statistics core has been split into yearly "shards" by DSpace's stats-util you need to search each shard separately.
+EOF
+
+    exit 0
+}
+
+function parse_options() {
+    while getopts ":df:ps:u:" opt; do
+        case $opt in
+            d)
+                DEBUG=yes
+                ;;
+            f)
+                SPIDERS_PATTERN_FILE=$OPTARG
+
+                if ! [[ -r "$SPIDERS_PATTERN_FILE" ]]; then
+                    echo "(ERROR) Spider patterns file \"$SPIDERS_PATTERN_FILE\" doesn't exist."
+
+                    exit 1
+                fi
+                ;;
+            p)
+                PURGE_SPIDER_HITS=yes
+                ;;
+            s)
+                STATISTICS_SHARD=$OPTARG
+                ;;
+            u)
+                # make sure -u is passed something like a URL
+                if ! [[ "$OPTARG" =~ ^https?://.*$ ]]; then
+                    usage
+                fi
+
+                SOLR_URL=$OPTARG
+                ;;
+            \?|:)
+                usage
+                ;;
+        esac
+    done
+}
+
+function envsetup() {
+    # check to see if user specified a Solr URL
+    # ... otherwise use the default
+    if [[ -z $SOLR_URL ]]; then
+        SOLR_URL=$DEF_SOLR_URL
+    fi
+
+    # check to see if user specified a spiders pattern file
+    # ... otherwise use the default
+    if [[ -z $SPIDERS_PATTERN_FILE ]]; then
+        SPIDERS_PATTERN_FILE=$DEF_SPIDERS_PATTERN_FILE
+    fi
+
+    # check to see if user specified Solr statistics shards
+    # ... otherwise use the default
+    if [[ -z $STATISTICS_SHARD ]]; then
+        STATISTICS_SHARD=$DEF_STATISTICS_SHARD
+    fi
+}
+
+# pass the shell's arguments (quoted, so paths with spaces survive) to the parser
+parse_options "$@"
+
+# set up the defaults
+envsetup
+
+[[ $DEBUG ]] && echo "(DEBUG) Using spiders pattern file: $SPIDERS_PATTERN_FILE"
+
+
+# Make a temporary copy of the spider file so we can do pattern replacement
+# inside it with sed rather than using stdout from sed and having to deal
+# with spaces and newlines in bash.
+SPIDERS_PATTERN_FILE_TEMP=$(mktemp)
+cp "$SPIDERS_PATTERN_FILE" "$SPIDERS_PATTERN_FILE_TEMP"
+
+# Read list of spider user agents from the patterns file, converting PCRE-style
+# regular expressions to a format that is easier to deal with in bash (spaces!)
+# and that Solr supports (ie, patterns are anchored by ^ and $ implicitly, and
+# some character types like \d are not supported).
+#
+# See: https://1opensourcelover.wordpress.com/2013/09/29/solr-regex-tutorial/
+#
+# For now this seems to be enough:
+#   - Replace \s with a literal space
+#   - Replace \d with [0-9] character class
+#   - Unescape dashes
+#   - Escape @
+#
+sed -i -e 's/\\s/ /g' -e 's/\\d/[0-9]/g' -e 's/\\-/-/g' -e 's/@/\\@/g' "$SPIDERS_PATTERN_FILE_TEMP"
+
+# Start a tally of bot hits so we can report the total at the end
+BOT_HITS=0
+
+while read -r spider; do
+    # Save the original pattern so we can inform the user later
+    original_spider=$spider
+
+    # Skip blank lines (an empty pattern would become ".*.*" and match every
+    # hit) and patterns containing a plus or percent sign (+ or %): escaping
+    # them in Solr seems to work for searches, but not for deletes. I don't
+    # have time to figure it out.
+    if [[ -z $spider || $spider =~ [%\+] ]]; then
+        [[ $DEBUG ]] && echo "(DEBUG) Skipping spider: $original_spider"
+        continue
+    fi
+
+
+    unset has_beginning_anchor
+    unset has_end_anchor
+
+    # Remove ^ at the beginning because it is implied in Solr's regex search
+    if [[ $spider =~ ^\^ ]]; then
+        spider=${spider#^}
+
+        # Record that this spider's original user agent pattern had a ^
+        has_beginning_anchor=yes
+    fi
+
+    # Remove $ at the end because it is implied in Solr's regex search
+    if [[ $spider == *\$ ]]; then
+        spider=${spider%\$}
+
+        # Record that this spider's original user agent pattern had a $
+        has_end_anchor=yes
+    fi
+
+    # If the original pattern did NOT have a beginning anchor (^), then add a
+    # wildcard at the beginning.
+    if [[ -z $has_beginning_anchor ]]; then
+        spider=".*$spider"
+    fi
+
+    # If the original pattern did NOT have an ending anchor ($), then add a
+    # wildcard at the end.
+    if [[ -z $has_end_anchor ]]; then
+        spider="$spider.*"
+    fi
+
+    [[ $DEBUG ]] && echo "(DEBUG) Checking for hits from spider: $original_spider"
+
+    # Check for hits from this spider in Solr and save results into a variable,
+    # setting a custom curl output format so I can get the HTTP status code and
+    # Solr response in one request, then tease them out later.
+    solr_result=$(curl -s -w "http_code=%{http_code}" "$SOLR_URL/$STATISTICS_SHARD/select" -d "q=userAgent:/$spider/&rows=0")
+
+    http_code=$(echo $solr_result | grep -o -E 'http_code=[0-9]+' | awk -F= '{print $2}')
+
+    # Check the Solr HTTP response code and skip spider if not successful
+    if [[ $http_code -ne 200 ]]; then
+        [[ $DEBUG ]] && echo "(DEBUG) Solr query returned HTTP $http_code, skipping $original_spider."
+
+        continue
+    fi
+
+    # lazy extraction of Solr numFound (relies on sed -E for extended regex)
+    numFound=$(echo $solr_result | sed -E 's/\s+http_code=[0-9]+//' | xmllint --format - | grep numFound | sed -E 's/^.*numFound="([0-9]+)".*$/\1/')
+
+    if [[ numFound -gt 0 ]]; then
+        if [[ $PURGE_SPIDER_HITS ]]; then
+            echo "Purging $numFound hits from $original_spider in $STATISTICS_SHARD"
+
+            # Purge the hits (XML delete-by-query) and soft commit
+            curl -s "$SOLR_URL/$STATISTICS_SHARD/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>userAgent:/$spider/</query></delete>" > /dev/null 2>&1
+        else
+            echo "Found $numFound hits from $original_spider in $STATISTICS_SHARD"
+        fi
+
+        BOT_HITS=$((BOT_HITS+numFound))
+    fi
+done < "$SPIDERS_PATTERN_FILE_TEMP"
+
+if [[ $BOT_HITS -gt 0 ]]; then
+    if [[ $PURGE_SPIDER_HITS ]]; then
+        echo
+        echo "Total number of bot hits purged: $BOT_HITS"
+
+        # Hard commit after we're done processing all spiders
+        curl -s "$SOLR_URL/$STATISTICS_SHARD/update?commit=true" > /dev/null 2>&1
+    else
+        echo
+        echo "Total number of hits from bots: $BOT_HITS"
+    fi
+fi
+
+if [[ -f "$SPIDERS_PATTERN_FILE_TEMP" ]]; then
+    rm "$SPIDERS_PATTERN_FILE_TEMP"
+fi
+
+# vim: set expandtab:ts=4:sw=4:bs=2
diff --git a/ilri/check-spider-ip-hits.sh b/ilri/check-spider-ip-hits.sh
new file mode 100755
index 000000000000..f990c0c310cc
--- /dev/null
+++ b/ilri/check-spider-ip-hits.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+#
+# check-spider-ip-hits.sh v0.0.2
+#
+# Copyright (C) 2020 Alan Orth
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Exit on first error
+set -o errexit
+
+# defaults
+readonly DEF_SPIDER_IPS_FILE=/dspace/config/spiders/agents/example
+readonly DEF_SOLR_URL=http://localhost:8081/solr
+readonly DEF_STATISTICS_SHARD=statistics
+
+######
+
+readonly PROGNAME=$(basename "$0")
+readonly ARGS="$@"
+
+function usage() {
+    cat <<-EOF
+Usage: $PROGNAME [-d] [-f $DEF_SPIDER_IPS_FILE] [-p] [-s $DEF_STATISTICS_SHARD] [-u $DEF_SOLR_URL]
+
+Optional arguments:
+    -d: print debug messages
+    -f: path to file containing spider IP addresses (default: $DEF_SPIDER_IPS_FILE)
+    -p: purge statistics that match spider IP addresses
+    -s: Solr statistics shard, for example statistics or statistics-2018¹ (default: $DEF_STATISTICS_SHARD)
+    -u: URL to Solr (default: $DEF_SOLR_URL)
+
+Written by: Alan Orth
+
+¹ If your statistics core has been split into yearly "shards" by DSpace's stats-util you need to search each shard separately.
+EOF
+
+    exit 0
+}
+
+function parse_options() {
+    while getopts ":df:ps:u:" opt; do
+        case $opt in
+            d)
+                DEBUG=yes
+                ;;
+            f)
+                SPIDER_IPS_FILE=$OPTARG
+
+                if ! [[ -r "$SPIDER_IPS_FILE" ]]; then
+                    echo "(ERROR) Spider IPs file \"$SPIDER_IPS_FILE\" doesn't exist."
+
+                    exit 1
+                fi
+                ;;
+            p)
+                PURGE_SPIDER_HITS=yes
+                ;;
+            s)
+                STATISTICS_SHARD=$OPTARG
+                ;;
+            u)
+                # make sure -u is passed something like a URL
+                if ! [[ "$OPTARG" =~ ^https?://.*$ ]]; then
+                    usage
+                fi
+
+                SOLR_URL=$OPTARG
+                ;;
+            \?|:)
+                usage
+                ;;
+        esac
+    done
+}
+
+function envsetup() {
+    # check to see if user specified a Solr URL
+    # ... otherwise use the default
+    if [[ -z $SOLR_URL ]]; then
+        SOLR_URL=$DEF_SOLR_URL
+    fi
+
+    # check to see if user specified a spider IPs file
+    # ... otherwise use the default
+    if [[ -z $SPIDER_IPS_FILE ]]; then
+        SPIDER_IPS_FILE=$DEF_SPIDER_IPS_FILE
+    fi
+
+    # check to see if user specified Solr statistics shards
+    # ... otherwise use the default
+    if [[ -z $STATISTICS_SHARD ]]; then
+        STATISTICS_SHARD=$DEF_STATISTICS_SHARD
+    fi
+}
+
+# pass the shell's arguments (quoted, so paths with spaces survive) to the parser
+parse_options "$@"
+
+# set up the defaults
+envsetup
+
+[[ $DEBUG ]] && echo "(DEBUG) Using spider IPs file: $SPIDER_IPS_FILE"
+
+# Read list of spider IPs, escaping colons in IPv6 address and skipping blank
+# lines and comments (#).
+IPS=$(sed -e 's/\:/\\:/g' "$SPIDER_IPS_FILE" | grep -v -E '^$' | grep -v '#')
+
+# Start a tally of bot hits so we can report the total at the end
+BOT_HITS=0
+
+for ip in $IPS; do
+    [[ $DEBUG ]] && echo "(DEBUG) Checking for hits from spider IP: $ip"
+
+    # Check for hits from this spider in Solr and save results into a variable,
+    # setting a custom curl output format so I can get the HTTP status code and
+    # Solr response in one request, then tease them out later.
+    solr_result=$(curl -s -w "http_code=%{http_code}" "$SOLR_URL/$STATISTICS_SHARD/select" -d "q=ip:/$ip/&rows=0")
+
+    http_code=$(echo $solr_result | grep -o -E 'http_code=[0-9]+' | awk -F= '{print $2}')
+
+    # Check the Solr HTTP response code and skip spider if not successful
+    if [[ $http_code -ne 200 ]]; then
+        [[ $DEBUG ]] && echo "(DEBUG) Solr query returned HTTP $http_code, skipping $ip."
+
+        continue
+    fi
+
+    # lazy extraction of Solr numFound (relies on sed -E for extended regex)
+    numFound=$(echo $solr_result | sed -E 's/\s+http_code=[0-9]+//' | xmllint --format - | grep numFound | sed -E 's/^.*numFound="([0-9]+)".*$/\1/')
+
+    if [[ numFound -gt 0 ]]; then
+        if [[ $PURGE_SPIDER_HITS ]]; then
+            echo "Purging $numFound hits from $ip in $STATISTICS_SHARD"
+
+            # Purge the hits (XML delete-by-query) and soft commit
+            curl -s "$SOLR_URL/$STATISTICS_SHARD/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>ip:/$ip/</query></delete>" > /dev/null 2>&1
+        else
+            echo "Found $numFound hits from $ip in $STATISTICS_SHARD"
+        fi
+
+        BOT_HITS=$((BOT_HITS+numFound))
+    fi
+done
+
+if [[ $BOT_HITS -gt 0 ]]; then
+    if [[ $PURGE_SPIDER_HITS ]]; then
+        echo
+        echo "Total number of bot hits purged: $BOT_HITS"
+
+        # Hard commit after we're done processing all spiders
+        curl -s "$SOLR_URL/$STATISTICS_SHARD/update?commit=true" > /dev/null 2>&1
+    else
+        echo
+        echo "Total number of hits from bots: $BOT_HITS"
+    fi
+fi
+
+# vim: set expandtab:ts=4:sw=4:bs=2
diff --git a/ilri/check_duplicates.py b/ilri/check_duplicates.py
new file mode 100755
index 000000000000..ed5c1953e99b
--- /dev/null
+++ b/ilri/check_duplicates.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+
+# check-duplicates.py 0.4.3
+#
+# Copyright Alan Orth.
+#
+# SPDX-License-Identifier: GPL-3.0-only
+#
+# ---
+#
+# Expects a CSV with at least four columns containing id, item titles, types, and
+# issue dates to be checked against the DSpace PostgreSQL database for potential
+# duplicates.
The database must have the trgm extention created in order for +# this to work: +# +# localhost/database= > CREATE EXTENSION pg_trgm; +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install psycopg colorama +# +# See: https://www.psycopg.org/psycopg3/docs + +import argparse +import csv +import signal +import sys +from datetime import datetime + +import util +from colorama import Fore +from psycopg import sql + + +def signal_handler(signal, frame): + sys.exit(1) + + +# Compare the item's date issued to that of the potential duplicate +def compare_date_strings(item_date, duplicate_date): + # Split the item date on "-" to see what format we need to + # use to create the datetime object. + if len(item_date.split("-")) == 1: + date1 = datetime.strptime(item_date, "%Y") + elif len(item_date.split("-")) == 2: + date1 = datetime.strptime(item_date, "%Y-%m") + elif len(item_date.split("-")) == 3: + date1 = datetime.strptime(item_date, "%Y-%m-%d") + + # Do the same for the potential duplicate's date + if len(duplicate_date.split("-")) == 1: + date2 = datetime.strptime(duplicate_date, "%Y") + elif len(duplicate_date.split("-")) == 2: + date2 = datetime.strptime(duplicate_date, "%Y-%m") + elif len(duplicate_date.split("-")) == 3: + date2 = datetime.strptime(duplicate_date, "%Y-%m-%d") + + # Return the difference between the two dates. Doesn't matter which comes + # first here because we are getting the absolute to avoid negative days! 
+ return abs((date1 - date2).days) + + +parser = argparse.ArgumentParser(description="Find duplicate titles.") +parser.add_argument( + "-i", + "--input-file", + help="Path to input CSV file.", + required=True, + type=argparse.FileType("r", encoding="UTF-8"), +) +parser.add_argument("-db", "--database-name", help="Database name", required=True) +parser.add_argument("-u", "--database-user", help="Database username", required=True) +parser.add_argument("-p", "--database-pass", help="Database password", required=True) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "--days-threshold", + type=float, + help="Threshold for difference of days between item and potential duplicates (default 365).", + default=365, +) +parser.add_argument( + "-o", + "--output-file", + help="Path to output CSV file.", + required=True, + type=argparse.FileType("w"), +) +parser.add_argument( + "-q", + "--quiet", + help="Do not print progress messages to the screen.", + action="store_true", +) +parser.add_argument( + "-s", + "--similarity-threshold", + type=float, + help="Similarity threshold, between 0.0 and 1.0 (default 0.6).", + default=0.6, +) +args = parser.parse_args() + +# Column names in the CSV +id_column_name = "id" +criteria1_column_name = "dc.title" +criteria2_column_name = "dcterms.type" +criteria3_column_name = "dcterms.issued" + +# open the CSV +reader = csv.DictReader(args.input_file) + +# check if the title column exists in the CSV +if criteria1_column_name not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f'Specified criteria one column "{criteria1_column_name}" does not exist in the CSV.' + + Fore.RESET + ) + sys.exit(1) +# check if the type column exists in the CSV +if criteria2_column_name not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f'Specified criteria two column "{criteria2_column_name}" does not exist in the CSV.' 
+ + Fore.RESET + ) + sys.exit(1) +# check if the date issued column exists in the CSV +if criteria3_column_name not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f'Specified criteria three column "{criteria3_column_name}" does not exist in the CSV.' + + Fore.RESET + ) + sys.exit(1) + + +# set the signal handler for SIGINT (^C) +signal.signal(signal.SIGINT, signal_handler) + +# connect to database +conn = util.db_connect( + args.database_name, args.database_user, args.database_pass, "localhost" +) + +# set the connection to read only since we are not writing anything +conn.read_only = True + +cursor = conn.cursor() + +# Field IDs from the metadatafieldregistry table +criteria1_field_id = util.field_name_to_field_id(cursor, criteria1_column_name) +criteria2_field_id = util.field_name_to_field_id(cursor, criteria2_column_name) +criteria3_field_id = util.field_name_to_field_id(cursor, criteria3_column_name) + +with conn: + # Make sure the pg_trgm extension is installed in the current database + cursor.execute("SELECT extname FROM pg_extension WHERE extname='pg_trgm'") + if cursor.rowcount == 0: + sys.stderr.write( + Fore.RED + + f"Database '{args.database_name}' is missing the 'pg_trgm' extension.\n" + + Fore.RESET + ) + sys.exit(1) + + # Set the similarity threshold for this session. PostgreSQL default is 0.3, + # which leads to lots of false positives for this use case. Note that the + # weird syntax here is because of SET not working in in psycopg3. 
+ # + # See: https://www.psycopg.org/psycopg3/docs/basic/from_pg2.html#server-side-binding + cursor.execute( + sql.SQL( + "SET pg_trgm.similarity_threshold = {}".format(args.similarity_threshold) + ) + ) + + # Fields for the output CSV + fieldnames = [ + "id", + "Your Title", + "Their Title", + "Similarity", + "Your Date", + "Their Date", + "Handle", + ] + + # Write the CSV header + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + for input_row in reader: + # Check for items with similarity to criteria one (title). Note that + # this is the fastest variation of this query: using the similarity + # operator (%, written below twice for escaping) instead of the sim- + # larity function, as indexes are bound to operators, not functions! + # Also, if I leave off the item query it takes twice as long! + sql = "SELECT text_value, dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value %% %s" + + cursor.execute( + sql, + ( + criteria1_field_id, + input_row[criteria1_column_name], + ), + ) + + # If we have any similarity in criteria one (title), then check type + if cursor.rowcount > 0: + duplicate_titles = cursor.fetchall() + + # Iterate over duplicate titles to check their types + for duplicate_title in duplicate_titles: + dspace_object_id = duplicate_title[1] + + # Check type of this duplicate title, also making sure that + # the item is in the archive and not withdrawn. + sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s AND M.text_value=%s AND I.in_archive='t' AND I.withdrawn='f'" + + cursor.execute( + sql, + ( + dspace_object_id, + criteria2_field_id, + input_row[criteria2_column_name], + ), + ) + + # This means we didn't match on item type, so let's skip to + # the next item title. 
+ if cursor.rowcount == 0: + continue + + # Get the date of this potential duplicate. (If we are here + # then we already confirmed above that the item is both in + # the archive and not withdrawn, so we don't need to check + # that again). + sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s" + + cursor.execute( + sql, + (dspace_object_id, criteria3_field_id), + ) + + # This means that we successfully extracted the date for the + # potential duplicate. + if cursor.rowcount > 0: + duplicate_item_date = cursor.fetchone()[0] + # If rowcount is not > 0 then the potential duplicate does + # not have a date and we have bigger problems. Skip! + else: + continue + + # Get the number of days between the issue dates + days_difference = compare_date_strings( + input_row[criteria3_column_name], duplicate_item_date + ) + + # Items with a similar title, same type, and issue dates + # within a year or so are likely duplicates. Otherwise, + # it's possible that items with a similar name could be + # like Annual Reports where most metadata is the same + # except the date issued. + if days_difference <= args.days_threshold: + # By this point if we have any matches then they are + # similar in title and have an exact match for the type + # and an issue date within the threshold. Now we are + # reasonably sure it's a duplicate, so get the handle. + sql = "SELECT handle FROM handle WHERE resource_id=%s" + cursor.execute(sql, (dspace_object_id,)) + try: + handle = f"https://hdl.handle.net/{cursor.fetchone()[0]}" + except TypeError: + # If we get here then there is no handle for this + # item's UUID. Could be that the item was deleted? 
+ continue + + sys.stdout.write( + f"{Fore.YELLOW}Found potential duplicate:{Fore.RESET}\n" + ) + + # https://alexklibisz.com/2022/02/18/optimizing-postgres-trigram-search.html + sql = "SELECT round(similarity(%s, %s)::numeric, 3)" + cursor.execute( + sql, (input_row[criteria1_column_name], duplicate_title[0]) + ) + trgm_similarity = cursor.fetchone()[0] + + sys.stdout.write( + f"{Fore.YELLOW}→ Title:{Fore.RESET} {input_row[criteria1_column_name]} ({trgm_similarity})\n" + ) + sys.stdout.write(f"{Fore.YELLOW}→ Handle:{Fore.RESET} {handle}\n\n") + + output_row = { + "id": input_row[id_column_name], + "Your Title": input_row[criteria1_column_name], + "Their Title": duplicate_title[0], + "Similarity": trgm_similarity, + "Your Date": input_row[criteria3_column_name], + "Their Date": duplicate_item_date, + "Handle": handle, + } + + writer.writerow(output_row) + + # close output file before we exit + args.output_file.close() + +# close input file +args.input_file.close() + +sys.exit(0) diff --git a/ilri/check_duplicates_fuzzy.py b/ilri/check_duplicates_fuzzy.py new file mode 100755 index 000000000000..afcfd879e382 --- /dev/null +++ b/ilri/check_duplicates_fuzzy.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 + +# check-duplicates.py 0.4.0 +# +# Copyright Alan Orth. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# --- +# +# Expects a CSV with at least four columns containing id, item titles, types,and +# issue dates to be checked against the DSpace PostgreSQL database for potential +# duplicates. The database must have the trgm extention created in order for +# this to work: +# +# localhost/database= > CREATE EXTENSION pg_trgm; +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install psycopg2-binary colorama +# +# See: http://initd.org/psycopg +# See: http://initd.org/psycopg/docs/usage.html#with-statement +# See: http://initd.org/psycopg/docs/faq.html#best-practices + +import argparse +import csv +import signal +import sys +from datetime import datetime + +import psycopg2 +from colorama import Fore + +# Column names in the CSV +id_column_name = "id" +criteria1_column_name = "dc.title" +criteria2_column_name = "dcterms.type" +criteria3_column_name = "dcterms.issued" +# Field IDs from the metadatafieldregistry table +criteria1_field_id = 64 +criteria2_field_id = 191 +criteria3_field_id = 170 + + +def signal_handler(signal, frame): + sys.exit(1) + + +# Compare the item's date issued to that of the potential duplicate +def compare_date_strings(item_date, duplicate_date): + # Split the item date on "-" to see what format we need to + # use to create the datetime object. 
# Command line interface for the duplicate checker.
parser = argparse.ArgumentParser(description="Find duplicate titles.")
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to input CSV file.",
    required=True,
    type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument("-db", "--database-name", help="Database name", required=True)
parser.add_argument("-u", "--database-user", help="Database username", required=True)
parser.add_argument("-p", "--database-pass", help="Database password", required=True)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "--days-threshold",
    type=float,
    help="Threshold for difference of days between item and potential duplicates (default 365).",
    default=365,
)
parser.add_argument(
    "-o",
    "--output-file",
    help="Path to output CSV file.",
    required=True,
    # Explicit encoding so titles with non-ASCII characters are written the
    # same way regardless of the locale (matches the input file above).
    type=argparse.FileType("w", encoding="UTF-8"),
)
parser.add_argument(
    "-q",
    "--quiet",
    help="Do not print progress messages to the screen.",
    action="store_true",
)
parser.add_argument(
    "-s",
    "--similarity-threshold",
    type=float,
    help="Similarity threshold, between 0.0 and 1.0 (default 0.6).",
    default=0.6,
)
args = parser.parse_args()

# open the CSV
reader = csv.DictReader(args.input_file)

# Make sure the three columns we compare on (title, type, date issued) exist
# in the CSV before doing any work. An empty CSV has no header at all.
csv_columns = reader.fieldnames or []
for ordinal, column_name in (
    ("one", criteria1_column_name),
    ("two", criteria2_column_name),
    ("three", criteria3_column_name),
):
    if column_name not in csv_columns:
        sys.stderr.write(
            Fore.RED
            + f'Specified criteria {ordinal} column "{column_name}" does not exist in the CSV.\n'
            + Fore.RESET
        )
        sys.exit(1)


# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# connect to database
try:
    conn = psycopg2.connect(
        f"dbname={args.database_name} user={args.database_user} password={args.database_pass} host=localhost"
    )

    if args.debug:
        sys.stderr.write(Fore.GREEN + "Connected to database.\n" + Fore.RESET)
except psycopg2.OperationalError:
    sys.stderr.write(Fore.RED + "Could not connect to database.\n" + Fore.RESET)
    sys.exit(1)

with conn:
    # cursor will be closed after this block exits
    # see: http://initd.org/psycopg/docs/usage.html#with-statement
    with conn.cursor() as cursor:
        # Make sure the pg_trgm extension is installed in the current database
        cursor.execute("SELECT extname FROM pg_extension WHERE extname='pg_trgm'")
        if cursor.rowcount == 0:
            sys.stderr.write(
                Fore.RED
                + f"Database '{args.database_name}' is missing the 'pg_trgm' extension.\n"
                + Fore.RESET
            )
            sys.exit(1)

        # Set the similarity threshold for this session. PostgreSQL default is
        # 0.3, which leads to lots of false positives for this use case.
        #
        # NOTE(review): the title query below uses LEVENSHTEIN_LESS_EQUAL and
        # not the pg_trgm similarity operator, so this setting (and the
        # extension check above) may be leftovers -- confirm before removing.
        cursor.execute(
            "SET pg_trgm.similarity_threshold = %s", (args.similarity_threshold,)
        )

        # Fields for the output CSV
        fieldnames = [
            "id",
            "Your Title",
            "Their Title",
            "Your Date",
            "Their Date",
            "Handle",
        ]

        # Write the CSV header
        writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
        writer.writeheader()

        for input_row in reader:
            # Check for items whose title (criteria one) is within a small
            # edit distance of ours. The sub-select on item restricts the
            # search to metadata values that belong to items.
            sql = "SELECT text_value, dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=%s AND LEVENSHTEIN_LESS_EQUAL(LOWER(%s), LEFT(LOWER(text_value), 255), 3) <= 3"

            cursor.execute(
                sql,
                (
                    criteria1_field_id,
                    input_row[criteria1_column_name],
                ),
            )

            # No similar titles at all, so move on to the next input row
            if cursor.rowcount <= 0:
                continue

            duplicate_titles = cursor.fetchall()

            # Iterate over duplicate titles to check their types
            for duplicate_title in duplicate_titles:
                dspace_object_id = duplicate_title[1]

                # Check type of this duplicate title, also making sure that
                # the item is in the archive and not withdrawn.
                sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s AND M.text_value=%s AND I.in_archive='t' AND I.withdrawn='f'"

                cursor.execute(
                    sql,
                    (
                        dspace_object_id,
                        criteria2_field_id,
                        input_row[criteria2_column_name],
                    ),
                )

                # This means we didn't match on item type, so let's skip to
                # the next item title.
                if cursor.rowcount == 0:
                    continue

                # Get the date of this potential duplicate. (If we are here
                # then we already confirmed above that the item is both in
                # the archive and not withdrawn, so we don't need to check
                # that again).
                sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s"

                cursor.execute(
                    sql,
                    (dspace_object_id, criteria3_field_id),
                )

                # If rowcount is not > 0 then the potential duplicate does
                # not have a date and we have bigger problems. Skip!
                if cursor.rowcount <= 0:
                    continue

                duplicate_item_date = cursor.fetchone()[0]

                # Get the number of days between the issue dates. The helper
                # parses both strings with strptime (ValueError on anything
                # that is not YYYY, YYYY-MM or YYYY-MM-DD) and leaves its
                # result unassigned for values with more than three parts, so
                # treat both as "cannot compare" instead of aborting the run.
                try:
                    days_difference = compare_date_strings(
                        input_row[criteria3_column_name], duplicate_item_date
                    )
                except (ValueError, UnboundLocalError):
                    if args.debug:
                        sys.stderr.write(
                            Fore.YELLOW
                            + f'Could not compare dates "{input_row[criteria3_column_name]}" and "{duplicate_item_date}", skipping.\n'
                            + Fore.RESET
                        )

                    continue

                # Items with a similar title, same type, and issue dates
                # within a year or so are likely duplicates. Otherwise,
                # it's possible that items with a similar name could be
                # like Annual Reports where most metadata is the same
                # except the date issued.
                if days_difference > args.days_threshold:
                    continue

                # By this point if we have any matches then they are
                # similar in title and have an exact match for the type
                # and an issue date within the threshold. Now we are
                # reasonably sure it's a duplicate, so get the handle.
                sql = "SELECT handle FROM handle WHERE resource_id=%s"
                cursor.execute(sql, (dspace_object_id,))
                handle_row = cursor.fetchone()

                # Still report the duplicate when no handle row exists; an
                # empty Handle column is more useful than a traceback.
                if handle_row is None or handle_row[0] is None:
                    handle = ""
                else:
                    handle = f"https://hdl.handle.net/{handle_row[0]}"

                # Progress output is suppressed by --quiet
                if not args.quiet:
                    sys.stdout.write(
                        f"{Fore.YELLOW}Found potential duplicate:{Fore.RESET}\n"
                    )
                    sys.stdout.write(
                        f"{Fore.YELLOW}→ Title:{Fore.RESET} {input_row[criteria1_column_name]}\n"
                    )
                    sys.stdout.write(
                        f"{Fore.YELLOW}→ Handle:{Fore.RESET} {handle}\n\n"
                    )

                output_row = {
                    "id": input_row[id_column_name],
                    "Your Title": input_row[criteria1_column_name],
                    "Their Title": duplicate_title[0],
                    "Your Date": input_row[criteria3_column_name],
                    "Their Date": duplicate_item_date,
                    "Handle": handle,
                }

                writer.writerow(output_row)

        # close output file before we exit
        args.output_file.close()


# close database connection before we exit
conn.close()

# close input file
args.input_file.close()

sys.exit(0)
try:
    # Quick handling of command line args, no time to implement argparse.
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
except IndexError:
    print("Please specify input and output files.")

    # sys.exit() rather than the exit() helper, which only exists when the
    # site module has been loaded.
    sys.exit(1)

with open(input_filename, "r", encoding="utf-8") as countries_in:
    # newline="" is what the csv module expects for files it writes to
    with open(
        output_filename, mode="w", encoding="utf-8", newline=""
    ) as countries_out:
        # Prepare the CSV
        fieldnames = ["alpha2", "Name"]
        csv_writer = csv.DictWriter(countries_out, fieldnames=fieldnames)
        csv_writer.writeheader()

        # Iterate over the file directly instead of loading every line first
        for line in countries_in:
            country_name = line.strip()

            print(f"Looking up {country_name}...")

            country_result = pycountry.countries.get(name=country_name)

            # Use the alpha2 code when there is an exact match. Otherwise
            # just save the name with no alpha2. Note that we could try with
            # a fuzzy search before giving up, but I have had some strange
            # issues with fuzzy search in the past.
            #
            # See: https://github.com/flyingcircusio/pycountry/issues/115
            if country_result is not None:
                country_alpha2 = country_result.alpha_2
            else:
                country_alpha2 = ""

            csv_writer.writerow({"alpha2": country_alpha2, "Name": country_name})
+# The Crossref database has a wealth of information about DOIs, for example the +# issue date, license, journal title, item type, authors, funders, etc. This +# information can be used to improve metadata in other systems. +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import csv +import logging +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +import util +from colorama import Fore + +# Create a local logger instance for this module. We don't do any configuration +# because this module might be used elsewhere that will have its own logging +# configuration. +logger = logging.getLogger(__name__) + + +# Crossref uses dates with single-digit month and day parts, so we need to pad +# them with zeros if they are less than 10. +def fix_crossref_date(crossref_date: list) -> str: + if len(crossref_date) == 1: + issued = crossref_date[0] + elif len(crossref_date) == 2: + if crossref_date[1] < 10: + crossref_date_month = f"0{crossref_date[1]}" + else: + crossref_date_month = crossref_date[1] + + issued = f"{crossref_date[0]}-{crossref_date_month}" + elif len(crossref_date) == 3: + if crossref_date[1] < 10: + crossref_date_month = f"0{crossref_date[1]}" + else: + crossref_date_month = crossref_date[1] + + if crossref_date[2] < 10: + crossref_date_day = f"0{crossref_date[2]}" + else: + crossref_date_day = crossref_date[2] + + issued = f"{crossref_date[0]}-{crossref_date_month}-{crossref_date_day}" + else: + issued = "" + + return issued + + +def resolve_doi(doi: str) -> None: + logger.info(Fore.GREEN + f"Looking up DOI: {doi}" + Fore.RESET) + + # First, check if this DOI is registered at Crossref + request_url = f"https://api.crossref.org/works/{doi}/agency" + request_params = {"mailto": args.email} + + try: + request = 
requests.get(request_url, params=request_params) + except requests.exceptions.ConnectionError: + logger.error(Fore.RED + "Connection error." + Fore.RESET) + + sys.exit(1) + + # HTTP 404 here means the DOI is not registered at Crossref + if not request.ok: + logger.debug( + Fore.YELLOW + + f"> DOI not in Crossref (cached: {request.from_cache})" + + Fore.RESET + ) + + return + + data = request.json() + + # Only proceed if this DOI registration agency is Crossref + match data["message"]["agency"]["label"]: + case "DataCite": + logger.debug( + Fore.YELLOW + + f"> Skipping DOI registered to DataCite (cached: {request.from_cache})" + + Fore.RESET + ) + + return + case "Public": + logger.debug( + Fore.YELLOW + + f'> Skipping DOI registered to "Public" (cached: {request.from_cache})' + + Fore.RESET + ) + + return + case "Crossref": + pass + + # Fetch the metadata for this DOI + request_url = f"https://api.crossref.org/works/{doi}" + request_params = {"mailto": args.email} + + try: + request = requests.get(request_url, params=request_params) + except requests.exceptions.ConnectionError: + logger.error(Fore.RED + "Connection error." + Fore.RESET) + + if not request.ok: + return + + logger.debug( + Fore.YELLOW + f"> DOI in Crossref (cached: {request.from_cache})" + Fore.RESET + ) + + data = request.json() + + # I don't know why title is an array of strings, but let's just get + # the first one. + try: + title = data["message"]["title"][0] + except IndexError: + title = "" + + # Create an empty list to keep our authors + authors = list() + affiliations = list() + + try: + for author in data["message"]["author"]: + # Some authors have no given name in Crossref + try: + # Crossref given name is often initials like "S. M." + # and we don't want that space! + author_given_name = author["given"].replace(". 
", ".") + except KeyError: + author_given_name = None + + # Some authors have no family name in Crossref + try: + author_family_name = author["family"] + except KeyError: + author_family_name = None + + # Naive construction of "Last, First Initials" when we have + # both of them. + if author_family_name and author_given_name: + authors.append(f"{author_family_name}, {author_given_name}") + # Otherwise we need to make do with only the family name + elif author_family_name and author_given_name is None: + authors.append(author_family_name) + # And sometimes we need to make do with only the given name + elif author_given_name and author_family_name is None: + authors.append(author_given_name) + + # Get any affiliations from the authors (not all have) + try: + for affiliation in author["affiliation"]: + if affiliation["name"] not in affiliations: + affiliations.append(affiliation["name"]) + # Not sure what we can except here + except: + pass + + # Believe it or not some items on Crossref have no author (doesn't + # mean the DOI itself won't, though). + # + # See: https://api.crossref.org/works/10.1638/2018-0110 + # See: https://doi.org/10.1638/2018-0110 + except KeyError: + authors = "" + + # Create an empty list to keep our funders + funders = list() + + try: + for funder in data["message"]["funder"]: + if funder["name"] not in funders: + funders.append(funder["name"]) + except KeyError: + pass + + # Get the abstract if it exists + try: + abstract = data["message"]["abstract"] + except KeyError: + abstract = "" + + try: + journal = data["message"]["container-title"][0] + except IndexError: + journal = "" + + # Create an empty list to hold ISSNs, as there could be more than one + issns = list() + + # Get the ISSN. For journal articles there is often a print ISSN and + # an electric ISSN. 
+ try: + for issn in data["message"]["ISSN"]: + issns.append(issn) + except KeyError: + issns = "" + + # Create an empty list to hold ISBNs, as there could be more than one + isbns = list() + + # Get the ISBN. For books and book chapters there is often a print + # ISBN and an electric ISBN. + try: + for isbn in data["message"]["isbn-type"]: + isbns.append(isbn["value"]) + except KeyError: + isbns = "" + + try: + publisher = data["message"]["publisher"] + except KeyError: + publisher = "" + + try: + volume = data["message"]["volume"] + except KeyError: + volume = "" + + try: + issue = data["message"]["issue"] + except KeyError: + issue = "" + + try: + page = data["message"]["page"] + except KeyError: + page = "" + + try: + item_type = data["message"]["type"] + except KeyError: + item_type = "" + + subjects = list() + + # Get the subjects. Still not sure if these are useful. We should + # check against AGROVOC before importing. + try: + for subject in data["message"]["subject"]: + subjects.append(subject) + except KeyError: + subjects = "" + + # It appears that *all* DOIs on Crossref have an "issued" date. This + # is the earliest of the print and online publishing dates. For now + # I will capture this so I can explore its implications and relation + # to other dates with real items in the repository. + # + # See: https://github.com/CrossRef/rest-api-doc/blob/master/api_format.md + issued = fix_crossref_date(data["message"]["issued"]["date-parts"][0]) + + # Date on which the work was published in print. Apparently not all + # DOIs have this so we need to try/except. Also note that there is + # a similar date in ["journal-issue"]["published-print"], but in my + # experience it is the same as this one 99% of the time when it is + # present (that's in 10,000 DOIs I checked in 2023-02). 
+ try: + published_print = fix_crossref_date( + data["message"]["published-print"]["date-parts"][0] + ) + except KeyError: + published_print = "" + + # Date on which the work was published online. Note again that there + # is also ["journal-issue"]["published-online"], but in my experience + # it is only present ~33% of the time, and is only 50% the same as + # published-online. For now I'm not sure what to make of that, so I + # will not use it. + try: + published_online = fix_crossref_date( + data["message"]["published-online"]["date-parts"][0] + ) + except KeyError: + published_online = "" + + # Not all items have licenses, and some have multiple licenses. We + # will check for licenses in the order we prefer them: am, vor, tdm, + # and unspecified. These correspond to: accepted manuscript, version + # of record, text and data mining, and unspecified. I'm curious if + # there is *ever* a case where we would want the tdm license...? Can + # these ever be CC if the others are missing? + doi_licenses = {} + try: + for doi_license in data["message"]["license"]: + content_version = doi_license["content-version"] + doi_licenses[content_version] = doi_license["URL"] + + if "am" in doi_licenses: + license_url = f'am: {doi_licenses["am"]}' + elif "vor" in doi_licenses: + license_url = f'vor: {doi_licenses["vor"]}' + elif "tdm" in doi_licenses: + license_url = f'tdm: {doi_licenses["tdm"]}' + else: + license_url = f'unspecified: {doi_licenses["unspecified"]}' + except KeyError: + license_url = "" + + writer.writerow( + { + "title": title, + "abstract": abstract, + "authors": "||".join(authors), + "affiliations": "||".join(affiliations), + "funders": "||".join(funders), + "doi": f"https://doi.org/{doi}", + "journal": journal, + "issn": "||".join(issns), + "isbn": "||".join(isbns), + "publisher": publisher, + "volume": volume, + "issue": issue, + "page": page, + "type": item_type, + "issued": issued, + "published_print": published_print, + "published_online": 
published_online, + "license": license_url, + "subjects": "||".join(subjects), + } + ) + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the Crossref REST API for metadata about DOIs." +) +parser.add_argument( + "-e", + "--email", + required=True, + help="Contact email to use in API requests so Crossref is more lenient with our request rate.", +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing DOIs to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file (CSV) to write results to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# Since we're running interactively we can set the preferred log format for +# the logging module during this invocation. 
logging.basicConfig(format="[%(levelname)s] %(message)s")

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# install a transparent requests cache
expire_after = timedelta(days=30)
requests_cache.install_cache(
    "requests-cache", expire_after=expire_after, allowable_codes=(200, 404)
)
# prune old cache entries (without expired=True nothing is selected for
# deletion)
requests_cache.delete(expired=True)

# Write the CSV header before starting. The output file is a required
# argument so it is always present here.
fieldnames = [
    "title",
    "abstract",
    "authors",
    "affiliations",
    "funders",
    "doi",
    "journal",
    "issn",
    "isbn",
    "publisher",
    "volume",
    "issue",
    "page",
    "type",
    "issued",
    "published_print",
    "published_online",
    "license",
    "subjects",
]
writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
writer.writeheader()

# Get the DOIs from the (required) input file and look up each one
dois = util.read_dois_from_file(args.input_file)
for doi in dois:
    resolve_doi(doi)

# close output file before we exit
args.output_file.close()
# read funders from a text file, one per line
def read_funders_from_file():
    """Read unique funder names from the input file and resolve them.

    Reads ``args.input_file`` line by line, closes it, and hands the
    de-duplicated names (in file order) to resolve_funders().
    """
    # initialize an empty list for funders
    funders = []

    for line in args.input_file:
        # trim any leading or trailing whitespace (including newlines)
        line = line.strip()

        # add funders that aren't already present, ignoring blank lines
        # (an empty query is not a funder name)
        if line and line not in funders:
            funders.append(line)

    # close input file before we exit
    args.input_file.close()

    resolve_funders(funders)


def _match_funder(funder, items):
    """Return "name", "alt-name" or None for the first item matching funder.

    Items are checked in order; for each one the name is compared before its
    alt-names. Comparison is case-insensitive.
    """
    wanted = funder.lower()

    for item in items:
        if item["name"].lower() == wanted:
            return "name"

        # check the alt-names for each search result (may be absent)
        for altname in item.get("alt-names", []):
            if altname.lower() == wanted:
                return "alt-name"

    return None


def resolve_funders(funders):
    """Look up each funder in the Crossref funders API and write a CSV.

    Writes one row per funder for which Crossref returned an OK response:
    the funder, the match type ("name", "alt-name" or empty) and whether it
    matched ("true"/"false"). Closes ``args.output_file`` when done.

    Args:
        funders: list of funder names to look up.

    Exits the program with status 1 on a connection error.
    """
    fieldnames = ["funder", "match type", "matched"]
    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with two weeks expiry because I don't
    # know how often Crossref is updated.
    expire_after = timedelta(days=14)
    requests_cache.install_cache("requests-cache", expire_after=expire_after)

    # prune old cache entries (without expired=True nothing is selected for
    # deletion)
    requests_cache.delete(expired=True)

    request_url = "https://api.crossref.org/funders"

    for funder in funders:
        if args.debug:
            sys.stderr.write(Fore.GREEN + f"Looking up funder: {funder}\n" + Fore.RESET)

        request_params = {"query": funder}

        if args.email:
            request_params.update(mailto=args.email)

        try:
            request = requests.get(request_url, params=request_params)
        except requests.exceptions.ConnectionError:
            sys.stderr.write(Fore.RED + "Connection error.\n" + Fore.RESET)

            # Carrying on would either fail on an unbound response or record
            # the previous funder's response for this one.
            args.output_file.close()

            sys.exit(1)

        if request.status_code != requests.codes.ok:
            continue

        data = request.json()

        # check if there are any results before looking for a match
        match_type = None
        if data["message"]["total-results"] > 0:
            match_type = _match_funder(funder, data["message"]["items"])

        if match_type == "name":
            print(
                f"Exact match for {funder} in Crossref (cached: {request.from_cache})"
            )
        elif match_type == "alt-name":
            print(
                f"Alt-name match for {funder} in Crossref (cached: {request.from_cache})"
            )
        elif args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + f"No match for {funder} in Crossref (cached: {request.from_cache})\n"
                + Fore.RESET
            )

        writer.writerow(
            {
                "funder": funder,
                "match type": match_type or "",
                "matched": "true" if match_type else "false",
            }
        )

    # close output file before we exit
    args.output_file.close()
# read journals from a text file, one per line
def read_issns_from_file():
    """Read unique ISSNs from the input file and resolve them.

    Reads ``args.input_file`` line by line, closes it, and hands the
    de-duplicated ISSNs (in file order) to resolve_issns().
    """
    # initialize an empty list for ISSNs
    issns = []

    for line in args.input_file:
        # trim any leading or trailing whitespace (including newlines)
        line = line.strip()

        # add ISSNs that aren't already present, ignoring blank lines
        if line and line not in issns:
            issns.append(line)

    # close input file before we exit
    args.input_file.close()

    resolve_issns(issns)


def resolve_issns(issns):
    """Look up each ISSN in the Crossref journals API and write a CSV.

    Writes one row per ISSN with the journal title reported by Crossref, or
    an empty title when Crossref has no match. Closes ``args.output_file``
    when done.

    Args:
        issns: list of ISSNs to look up.

    Exits the program with status 1 on a connection error.
    """
    fieldnames = ["issn", "journal title"]
    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with two weeks expiry because I don't
    # know how often Crossref is updated.
    expire_after = timedelta(days=14)
    requests_cache.install_cache("requests-cache", expire_after=expire_after)

    # prune old cache entries (without expired=True nothing is selected for
    # deletion)
    requests_cache.delete(expired=True)

    # The contact email is optional; the parameters must be defined either
    # way so that requests without it still work.
    request_params = {}
    if args.email:
        request_params["mailto"] = args.email

    for issn in issns:
        if args.debug:
            sys.stderr.write(Fore.GREEN + f"Looking up ISSN: {issn}\n" + Fore.RESET)

        request_url = f"https://api.crossref.org/journals/{issn}"

        try:
            request = requests.get(request_url, params=request_params)
        except requests.exceptions.ConnectionError:
            sys.stderr.write(Fore.RED + "Connection error.\n" + Fore.RESET)

            # Carrying on would either fail on an unbound response or record
            # the previous ISSN's response for this one.
            args.output_file.close()

            sys.exit(1)

        matched = False
        journal_title = ""

        # CrossRef responds 404 if a journal isn't found, so we check for an
        # HTTP 2xx response here
        if request.status_code == requests.codes.ok:
            data = request.json()

            # sanity check if our ISSN is in CrossRef's response (do we
            # need to check lowercase here?)
            if issn in data["message"]["ISSN"]:
                matched = True
                journal_title = data["message"]["title"]

        if matched:
            print(f"Exact match for {issn} in Crossref (cached: {request.from_cache})")
        elif args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + f"No match for {issn} in Crossref (cached: {request.from_cache})\n"
                + Fore.RESET
            )

        # Every ISSN gets a row, with an empty title when there is no match
        writer.writerow({"issn": issn, "journal title": journal_title})

    # close output file before we exit
    args.output_file.close()
def signal_handler(signal, frame):
    """Exit when the user interrupts the script with ^C (SIGINT)."""
    sys.exit(0)


parser = argparse.ArgumentParser(
    description="Delete metadata values in the DSpace SQL database."
)
parser.add_argument(
    "-i",
    "--csv-file",
    help="Path to CSV file",
    # The script cannot do anything without the CSV, so let argparse report
    # a missing file name instead of failing later in csv.DictReader.
    required=True,
    type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument("-db", "--database-name", help="Database name", required=True)
parser.add_argument("-u", "--database-user", help="Database username", required=True)
parser.add_argument("-p", "--database-pass", help="Database password", required=True)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-n",
    "--dry-run",
    help="Only print changes that would be made.",
    action="store_true",
)
parser.add_argument(
    "-f",
    "--from-field-name",
    help="Name of column with values to be deleted",
    required=True,
)
parser.add_argument(
    "-q",
    "--quiet",
    help="Do not print progress messages to the screen.",
    action="store_true",
)
args = parser.parse_args()

# open the CSV
reader = csv.DictReader(args.csv_file)

# check if the field specified by the user exists in the CSV (an empty CSV
# has no header at all)
if args.from_field_name not in (reader.fieldnames or []):
    sys.stderr.write(
        Fore.RED
        + 'Specified field "{0}" does not exist in the CSV.\n'.format(
            args.from_field_name
        )
        + Fore.RESET
    )
    sys.exit(1)

# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# connect to database
conn = util.db_connect(
    args.database_name, args.database_user, args.database_pass, "localhost"
)

cursor = conn.cursor()

# The field is the same for every row, so resolve its ID once up front.
# NOTE(review): assumes util.field_name_to_field_id() is a pure lookup with no
# per-row side effects -- confirm against util.
metadata_field_id = util.field_name_to_field_id(cursor, args.from_field_name)

for row in reader:
    value_to_delete = row[args.from_field_name]

    # Get item UUIDs for metadata values that will be deleted
    sql = "SELECT dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(sql, (metadata_field_id, value_to_delete))

    # Nothing to delete for this value
    if cursor.rowcount <= 0:
        continue

    if args.dry_run:
        if not args.quiet:
            print(
                Fore.GREEN
                + "Would delete {0} occurences of: {1}".format(
                    cursor.rowcount, value_to_delete
                )
                + Fore.RESET
            )

        # Since this a dry run we can continue to the next value
        continue

    # Get the records for items with matching metadata. We will use the
    # object IDs to update their last_modified dates.
    matching_records = cursor.fetchall()

    sql = "DELETE from metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(sql, (metadata_field_id, value_to_delete))

    if cursor.rowcount > 0 and not args.quiet:
        print(
            Fore.GREEN
            + "Deleted {0} occurences of: {1}".format(
                cursor.rowcount, value_to_delete
            )
            + Fore.RESET
        )

    # Update the last_modified date for each item we've changed
    for record in matching_records:
        util.update_item_last_modified(cursor, record[0])


# commit the changes when we are done
if not args.dry_run:
    conn.commit()

# close database connection before we exit
conn.close()

# close the input file
args.csv_file.close()

sys.exit(0)
def resolve_doi(dois):
    """Resolve one DOI to its item title and Handle and write a CSV row.

    The DOI is matched (case-insensitive PostgreSQL regex) against the DOI
    metadata field. When exactly one item matches, the item's title and Handle
    are looked up and written through the module-level ``writer``. DOIs that
    match no item or several items, and items missing a title or Handle, are
    reported and skipped.

    Uses the module-level ``conn``, ``args`` and ``writer``.

    dois -- a single DOI string (the parameter name is plural for historical
            reasons; it is kept so existing callers are unaffected).
    """
    # Use the argument rather than whatever global ``doi`` the caller's loop
    # happens to have set.
    doi = dois

    # metadata_field_id for metadata values (from metadatafieldregistry and
    # might differ from site to site).
    title_metadata_field_id = 64
    handle_metadata_field_id = 25
    doi_metadata_field_id = 220

    if not args.quiet:
        print(f"Looking up {doi} in database")

    cursor = conn.cursor()

    with conn.transaction():
        # make a temporary string we can use with the PostgreSQL regex
        # NOTE(review): the DOI is not regex-escaped, so "." in a DOI matches
        # any character -- confirm this looseness is intended.
        doi_string = f".*{doi}.*"

        # get the dspace_object_id for the item with this DOI
        sql = "SELECT dspace_object_id FROM metadatavalue WHERE metadata_field_id=%s AND text_value ~* %s"
        cursor.execute(
            sql,
            (doi_metadata_field_id, doi_string),
        )

        # make sure rowcount is exactly 1, because some DOIs are used
        # multiple times. The --quiet flag only silences messages; it must
        # not change which branch is taken.
        if cursor.rowcount < 1:
            print(f"Not found: {doi}")

            return

        if cursor.rowcount > 1:
            if not args.quiet:
                print(f"Found multiple items for {doi}")

            return

        dspace_object_id = cursor.fetchone()[0]
        if not args.quiet:
            print(f"Found {doi}, DSpace object: {dspace_object_id}")

        # get the title
        sql = "SELECT text_value FROM metadatavalue WHERE metadata_field_id=%s AND dspace_object_id=%s"
        cursor.execute(sql, (title_metadata_field_id, dspace_object_id))

        if cursor.rowcount != 1:
            print(f"Missing title for {doi}, skipping")

            return

        title = cursor.fetchone()[0]

        # get the handle (same query, different metadata field)
        cursor.execute(sql, (handle_metadata_field_id, dspace_object_id))

        if cursor.rowcount != 1:
            print(f"Missing handle for {doi}, skipping")

            return

        handle = cursor.fetchone()[0]

        row = {
            "title": title,
            "handle": handle,
            "doi": doi,
        }

        writer.writerow(row)
parser = argparse.ArgumentParser(
    description="Query DSpace database for item metadata based on a list of DOIs in a text file."
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument("-db", "--database-name", help="Database name", required=True)
parser.add_argument(
    "-i",
    "--input-file",
    help="File name containing DOIs to resolve.",
    required=True,
    type=argparse.FileType("r"),
)
parser.add_argument(
    "-o",
    "--output-file",
    help="File name to save CSV output.",
    required=True,
    type=argparse.FileType("w"),
)
parser.add_argument("-p", "--database-pass", help="Database password", required=True)
parser.add_argument(
    "-q",
    "--quiet",
    help="Do not print progress messages to the screen.",
    action="store_true",
)
parser.add_argument("-u", "--database-user", help="Database username", required=True)
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# connect to database
conn = util.db_connect(
    args.database_name, args.database_user, args.database_pass, "localhost"
)

# Set this connection to be read only since we are not modifying the database
conn.read_only = True

# field names for the CSV
fieldnames = ["title", "handle", "doi"]

writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
writer.writeheader()

try:
    dois = util.read_dois_from_file(args.input_file)
    # The loop variable stays a module-level ``doi`` because resolve_doi has
    # historically read that global.
    for doi in dois:
        resolve_doi(doi)
finally:
    # Release the files and the database connection even if a lookup raises,
    # so the CSV written so far is flushed to disk.
    args.input_file.close()
    args.output_file.close()
    conn.close()

# sys.exit() rather than the interactive-only exit() helper from ``site``
sys.exit(0)
def parse_community(community_id):
    """Return the Initiative collections of a community as {name: handle}.

    Requests the community from the REST API with its collections expanded and
    keeps only the collections whose name contains
    ``initiative_column_name_prefix``. Exits the program with status 1 when
    the API cannot be reached or does not answer with HTTP 200.

    Uses the module-level ``rest_base_url``, ``rest_communities_endpoint``,
    ``rest_user_agent``, ``initiative_column_name_prefix`` and ``args``.
    """
    request_url = (
        f"{rest_base_url}{rest_communities_endpoint}{community_id}"
        "?expand=collections"
    )

    try:
        request = requests.get(request_url, headers={"user-agent": rest_user_agent})
    except requests.ConnectionError:
        sys.stderr.write(
            f"{Fore.RED}Could not connect to {args.rest_url}.{Fore.RESET}\n"
        )
        # sys.exit() instead of exit(): the latter is an interactive helper
        # injected by the ``site`` module and is not meant for programs.
        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(
            f"{Fore.RED}Status not OK! Request URL was: {request_url}{Fore.RESET}\n"
        )
        sys.exit(1)

    # We are only interested in Initiative collections
    return {
        collection["name"]: collection["handle"]
        for collection in request.json()["collections"]
        if initiative_column_name_prefix in collection["name"]
    }
# Enable a transparent request cache with a one-day expiry, because Initiative
# names could have changed.
expire_after = timedelta(days=1)
requests_cache.install_cache("requests-cache", expire_after=expire_after)

# Prune expired cache entries. delete() removes nothing unless it is told
# which responses to drop.
requests_cache.delete(expired=True)

# Fetch the controlled vocabulary for Initiatives
try:
    request = requests.get(
        initiatives_list_url, headers={"user-agent": rest_user_agent}
    )
except requests.ConnectionError:
    sys.stderr.write(
        f"{Fore.RED}Could not connect to {initiatives_list_url}.{Fore.RESET}\n"
    )
    sys.exit(1)

# Without the vocabulary nothing below can work, so stop here instead of
# failing later with an undefined initiatives_list.
if request.status_code != requests.codes.ok:
    sys.stderr.write(
        f"{Fore.RED}Could not fetch the Initiatives list (HTTP {request.status_code}): {initiatives_list_url}{Fore.RESET}\n"
    )
    sys.exit(1)

# Convert the response text to a list so we can use it for lookups later
initiatives_list = request.text.splitlines()

# Fetch the metadata for the given community handle
request_url = rest_base_url + rest_handle_endpoint + str(handle)
try:
    request = requests.get(request_url, headers={"user-agent": rest_user_agent})
except requests.ConnectionError:
    sys.stderr.write(
        f"{Fore.RED}Could not connect to REST API: {args.rest_url}.{Fore.RESET}\n"
    )
    sys.exit(1)

# Check the request status
if request.status_code != requests.codes.ok:
    sys.stderr.write(
        f"{Fore.RED}Request failed. Are you sure {handle} is a valid handle?{Fore.RESET}\n"
    )
    sys.exit(1)

handle_type = request.json()["type"]

# Make sure the given handle is a community
if handle_type != "community":
    sys.stderr.write(
        f'{Fore.RED}{handle} is type "{handle_type}", not community.{Fore.RESET}\n'
    )
    sys.exit(1)

community_id = request.json()["uuid"]
initiative_collections = parse_community(community_id)

# Reverse lookup (collection handle -> long Initiative collection name) for
# the second phase below.
initiative_names_by_collection = {
    collection: initiative for initiative, collection in initiative_collections.items()
}

# Open the input file
reader = csv.DictReader(args.input_file)

# Check if the columns exist in the input file (fieldnames is None when the
# file has no header row)
input_fieldnames = reader.fieldnames or []
for column_label, column_name in (
    ("ID", id_column_name),
    ("collection", collection_column_name),
    ("Initiative", initiative_column_name),
):
    if column_name not in input_fieldnames:
        sys.stderr.write(
            f'{Fore.RED}Specified {column_label} column "{column_name}" does not exist in the CSV.{Fore.RESET}\n'
        )
        sys.exit(1)

# Fields for the output CSV
fieldnames = [
    id_column_name,
    collection_column_name,
]

# Write the CSV header
writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
writer.writeheader()

# Iterate over the input file to check each item's Initiatives and collections
for input_row in reader:
    item_id = input_row[id_column_name]
    # Get the item's current collections; keep the original list untouched so
    # we can tell at the end whether anything changed.
    original_collections = input_row[collection_column_name].split("||")
    item_collections = list(original_collections)
    item_initiatives = input_row[initiative_column_name].split("||")

    # First, iterate over the item's Initiatives so we can see if it is mapped
    # to appropriate collections.
    for item_initiative in item_initiatives:
        if item_initiative in initiatives_list:
            # Our Initiative metadata uses the short names, but the
            # corresponding collection names are prefixed with
            # "CGIAR Initiative on ".
            initiative_collection_name = (
                f"{initiative_column_name_prefix}{item_initiative}"
            )
            correct_initiative_collection = initiative_collections.get(
                initiative_collection_name
            )

            # A valid Initiative may have no collection in this community;
            # report it and move on instead of crashing with a KeyError.
            if correct_initiative_collection is None:
                sys.stderr.write(
                    f"{Fore.RED}(Phase 1) {item_id} has Initiative with no collection in this community: {item_initiative}{Fore.RESET}\n"
                )

                continue

            if correct_initiative_collection in item_collections:
                if args.debug:
                    print(
                        f"{Fore.GREEN}(Phase 1) {item_id} is correctly mapped to Initiative collection: {correct_initiative_collection} ({item_initiative}){Fore.RESET}"
                    )
            else:
                print(
                    f"{Fore.YELLOW}(Phase 1) {item_id} mapping to Initiative collection: {correct_initiative_collection} ({item_initiative}){Fore.RESET}"
                )

                # Add the collection
                item_collections.append(correct_initiative_collection)
        elif not item_initiative:
            if args.debug:
                sys.stderr.write(
                    f"{Fore.RED}(Phase 1) {item_id} has no Initiative metadata{Fore.RESET}\n"
                )
        else:
            sys.stderr.write(
                f"{Fore.RED}(Phase 1) {item_id} has invalid Initiative: {item_initiative}{Fore.RESET}\n"
            )

    # Empty list to hold incorrectly mapped collections we find for this item
    incorrectly_mapped_collections = []

    # Second, iterate over the item's collections to see if each one has
    # corresponding Initiative metadata.
    for item_collection in item_collections:
        initiative = initiative_names_by_collection.get(item_collection)

        # Not an Initiative collection, nothing to verify
        if initiative is None:
            continue

        # Remember the collection names use the long Initiative name
        initiative_short_name = initiative.replace(initiative_column_name_prefix, "")

        if initiative_short_name in item_initiatives:
            if args.debug:
                print(
                    f"{Fore.GREEN}(Phase 2) {item_id} is correctly mapped to Initiative collection: {item_collection} ({initiative_short_name}){Fore.RESET}"
                )
        elif args.remove:
            sys.stderr.write(
                f"{Fore.YELLOW}(Phase 2) {item_id} unmapping from Initiative collection: {item_collection} ({initiative_short_name}){Fore.RESET}\n"
            )

            incorrectly_mapped_collections.append(item_collection)
        else:
            sys.stderr.write(
                f"{Fore.RED}(Phase 2) {item_id} is incorrectly mapped to Initiative collection: {item_collection} ({initiative_short_name}){Fore.RESET}\n"
            )

    # Removal happens after the loop so we never mutate the list while
    # iterating over it.
    for incorrectly_mapped_collection in incorrectly_mapped_collections:
        item_collections.remove(incorrectly_mapped_collection)

    # We only need to save the item to the output CSV if we have changed its
    # mappings.
    if item_collections != original_collections:
        # We only need to write the IDs and collections to the output file since we
        # are not modifying any other metadata in the CSV.
        output_row = {
            id_column_name: item_id,
            collection_column_name: "||".join(item_collections),
        }

        writer.writerow(output_row)

# close CSV files before we exit
args.input_file.close()
args.output_file.close()

sys.exit(0)
def fix_city(value):
    """Extract the English city name from a literal MaxMind City string.

    Input looks like ``com.maxmind.geoip2.record.City [ {...json...} ]``. The
    wrapper is removed from the two ends only, the remainder is parsed as JSON,
    and ``names.en`` is returned.

    Returns an empty string when the JSON cannot be parsed or has no English
    name. A value that does not carry the MaxMind wrapper is returned
    unchanged.
    """
    prefix = "com.maxmind.geoip2.record.City [ "
    suffix = " ]"

    # Nothing to clean: leave ordinary city names alone
    if not value.startswith(prefix):
        return value

    # Strip the wrapper from the ends only, so the same character sequences
    # inside the JSON payload are preserved.
    value = value[len(prefix) :]
    if value.endswith(suffix):
        value = value[: -len(suffix)]

    # Try to read the cleaned string as a dict and access the English name
    try:
        return json.loads(value)["names"]["en"]
    except (KeyError, TypeError, ValueError):
        # KeyError: no "names"/"en"; TypeError: unexpected JSON shape;
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return ""
parser = argparse.ArgumentParser(
    description="Find and replace metadata values in the DSpace SQL database."
)
parser.add_argument(
    "-i",
    "--csv-file",
    help="Path to CSV file",
    required=True,
    type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument("-db", "--database-name", help="Database name", required=True)
parser.add_argument("-u", "--database-user", help="Database username", required=True)
parser.add_argument("-p", "--database-pass", help="Database password", required=True)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-n",
    "--dry-run",
    help="Only print changes that would be made.",
    action="store_true",
)
parser.add_argument(
    "-f",
    "--from-field-name",
    help="Name of column with values to be replaced.",
    required=True,
)
parser.add_argument(
    "-q",
    "--quiet",
    help="Do not print progress messages to the screen.",
    action="store_true",
)
parser.add_argument(
    "-t",
    "--to-field-name",
    help="Name of column with values to replace.",
    required=True,
)
args = parser.parse_args()

# The default log level is WARNING, but we want to set it to DEBUG or INFO
logger.setLevel(logging.DEBUG if args.debug else logging.INFO)

# Set the global log format
logging.basicConfig(format="[%(levelname)s] %(message)s")

# open the CSV
reader = csv.DictReader(args.csv_file)

# check if the from/to fields specified by the user exist in the CSV
# (fieldnames is None when the file has no header row)
csv_fieldnames = reader.fieldnames or []
for field_name in (args.from_field_name, args.to_field_name):
    if field_name not in csv_fieldnames:
        logger.error(
            Fore.RED
            + f'Specified field "{field_name}" does not exist in the CSV.'
            + Fore.RESET
        )
        sys.exit(1)

# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# connect to database
conn = util.db_connect(
    args.database_name, args.database_user, args.database_pass, "localhost"
)

cursor = conn.cursor()

# The metadata field is the same for every row, so resolve it once instead of
# once per CSV row.
metadata_field_id = util.field_name_to_field_id(cursor, args.from_field_name)

for row in reader:
    from_value = row[args.from_field_name]
    to_value = row[args.to_field_name]

    if from_value == to_value:
        # sometimes editors send me corrections with identical search/replace patterns
        logger.debug(
            Fore.YELLOW
            + f"Skipping identical search and replace for value: {from_value}"
            + Fore.RESET
        )

        continue

    if "|" in to_value:
        # sometimes editors send me corrections with multi-value fields, which are supported in DSpace itself, but not here!
        logger.debug(
            Fore.YELLOW
            + f"Skipping correction with invalid | character: {to_value}"
            + Fore.RESET
        )

        continue

    # Get item UUIDs for metadata values that will be updated
    sql = "SELECT dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(sql, (metadata_field_id, from_value))

    # Nothing matches this value, go to the next correction
    if cursor.rowcount <= 0:
        continue

    if args.dry_run:
        if not args.quiet:
            logger.info(
                Fore.GREEN
                + f"(DRY RUN) Fixed {cursor.rowcount} occurences of: {from_value}"
                + Fore.RESET
            )

        # Since this a dry run we can continue to the next replacement
        continue

    # Get the records for items with matching metadata. We will use the
    # object IDs to update their last_modified dates.
    matching_records = cursor.fetchall()

    sql = "UPDATE metadatavalue SET text_value=%s WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(
        sql,
        (
            to_value,
            metadata_field_id,
            from_value,
        ),
    )

    if cursor.rowcount > 0 and not args.quiet:
        logger.info(
            Fore.GREEN
            + f"Fixed {cursor.rowcount} occurences of: {from_value}"
            + Fore.RESET
        )

    # Update the last_modified date for each item we've changed
    for record in matching_records:
        util.update_item_last_modified(cursor, record[0])


# commit changes after we are done
if not args.dry_run:
    conn.commit()

# close database connection before we exit
conn.close()

# close input file
args.csv_file.close()

sys.exit(0)
# Candidate city names for each supported two-letter country code.
_CITIES_BY_COUNTRY = {
    "MX": [
        "Oaxaca",
        "Juarez",
        "Puebla",
        "Mexico",
        "Texmelucan",
        "Cancún",
        "Tultitlán",
        "Minatitlán",
    ],
    "HN": ["El Progreso", "Tegucigalpa", "San Pedro Sula", "La Ceiba"],
    "CO": [
        "Bogotá",
        "Medellín",
        "Cali",
        "Jamundi",
        "Barranquilla",
        "Villavicencio",
    ],
    "BR": [
        "Sao Luis",
        "Rio De Janeiro",
        "Guaira",
        "Cruzeiro Do Sul",
        "Santo Antonio De Jesus",
        "Valinhos",
        "Ituiutaba",
        "Sobradinho",
        "Maringa",
    ],
    "NI": [
        "Chinandega",
        "Managua",
        "Masaya",
        "San Juan Del Sur",
        "Matagalpa",
        "Estelí",
        "León",
        "Acoyapa",
    ],
}

# Continent code stored alongside each supported country code.
_CONTINENT_BY_COUNTRY = {
    "MX": "NA",
    "HN": "NA",
    "CO": "SA",
    "BR": "SA",
    "NI": "NA",
}


def random_datetime() -> datetime:
    """Return a random UTC datetime between the upload and the last check."""
    # An explicit "+00:00" offset is used instead of the "Z" suffix, which
    # datetime.fromisoformat() only understands from Python 3.11 onwards.
    # When the item was uploaded to CGSpace
    start_date = datetime.fromisoformat("2023-09-26T00:00:00+00:00")
    # When the researcher last checked the statistics
    end_date = datetime.fromisoformat("2023-10-20T00:00:00+00:00")

    # Scale the whole interval by a random fraction in [0, 1)
    return start_date + random.random() * (end_date - start_date)


def random_city(country_code: str) -> str:
    """Return a randomly chosen city name for the given country code.

    Raises ValueError for a country code that has no city list.
    """
    try:
        cities = _CITIES_BY_COUNTRY[country_code]
    except KeyError:
        raise ValueError(f"Unsupported country code: {country_code}") from None

    return random.choice(cities)


def country_continent(country_code: str) -> str:
    """Return the continent code for the given country code.

    Raises ValueError for a country code that has no continent mapping.
    """
    try:
        return _CONTINENT_BY_COUNTRY[country_code]
    except KeyError:
        raise ValueError(f"Unsupported country code: {country_code}") from None
# This is the reference statistic that we want to base our new statistics on.
# input_filename = "/home/aorth/Downloads/maria-no-atmire-schema.json"
input_filename = "/home/aorth/Downloads/maria.json"
output_filename = "/tmp/out.json"

# Start from a clean output file because we append to it below
if os.path.exists(output_filename):
    os.remove(output_filename)

with open(input_filename, "r", encoding="UTF-8") as f:
    json_data = json.load(f)

# Check if this statistic has fields from the Atmire CUA schema
atmire_cua = "cua_version" in json_data

# Delete some stuff that isn't required. pop() with a default is used so a
# reference statistic that lacks one of these fields does not abort the run.
#   _version_: Solr adds this automatically on insert
#   ip, dns, latitude, longitude: too annoying to do for fake statistics, and
#   not needed by any usage graphs
for field_name in ("_version_", "ip", "dns", "latitude", "longitude"):
    json_data.pop(field_name, None)

# Don't think we need these. The *_ngram and *_search fields are custom Atmire
# modifications to the Solr schema that get copied from the relevant field on
# insert.
if atmire_cua:
    for field_name in (
        "ip_ngram",
        "ip_search",
        "referrer_ngram",
        "referrer_search",
        "userAgent_ngram",
        "userAgent_search",
        "countryCode_ngram",
        "countryCode_search",
    ):
        json_data.pop(field_name, None)

# Set a user agent. Hey it's me!
json_data[
    "userAgent"
] = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"

# Open the output file once and write one line per generated statistic. The
# same dict is reused and mutated; that is safe because every record is
# written out before the next one is prepared.
with jsonlines.open(output_filename, mode="a") as writer:
    for country_code in ["MX", "HN", "CO", "BR", "NI"]:
        json_data["countryCode"] = country_code
        if atmire_cua:
            json_data["geoIpCountryCode"] = [country_code]
        json_data["continent"] = country_continent(country_code)

        # 640 statistics for each of the five countries
        for _ in range(640):
            dt = random_datetime()
            # Set a random time in our range
            json_data["time"] = dt.strftime("%Y-%m-%dT%H:%M:%SZ")
            if atmire_cua:
                json_data["dateYear"] = dt.strftime("%Y")
                json_data["dateYearMonth"] = dt.strftime("%Y-%m")

            # Set a random city from our list
            json_data["city"] = random_city(country_code)
            # Set a unique UUIDv4 (required in Solr stats schema)
            json_data["uid"] = str(uuid4())

            writer.write(json_data)
# Process thumbnails from filename.pdf to filename.jpg using libvips. Equivalent
# to the following shell invocation:
#
#   vipsthumbnail 64661.pdf -s 600 -o '%s.jpg[Q=85,optimize_coding,strip]'
#
# vips is faster than GraphicsMagick/ImageMagick, uses less memory, and seems
# to generate better quality images. Note that libvips uses poppler instead of
# Ghostscript, which means that CMYK colorspace is not supported. We might need
# to do something about that...
#
# See: https://github.com/libvips/libvips/issues/379
def create_thumbnail(row):
    """Create a JPEG thumbnail next to each downloaded file named in the row.

    The filename field may hold several names separated by "||" (the same
    convention download_bitstream uses); each one is handled on its own.
    Files that are missing or already have a thumbnail are skipped. With
    --dry-run nothing is written, only reported.
    """
    for filename in row[args.filename_field_name].split("||"):
        thumbnail = os.path.splitext(filename)[0] + ".jpg"

        # check if the file has been downloaded (in a dry run it never is,
        # so that is not worth reporting)
        if not os.path.isfile(filename) and not args.dry_run:
            if args.debug:
                print(Fore.YELLOW + f"> Missing {filename}.\n" + Fore.RESET)
        # check if we already have a thumbnail
        elif os.path.isfile(thumbnail):
            if args.debug:
                print(
                    Fore.YELLOW
                    + f"> Thumbnail for {filename} already exists.\n"
                    + Fore.RESET
                )
        elif args.dry_run:
            print(
                Fore.GREEN
                + f"> (DRY RUN) Would create thumbnail for {filename}."
                + Fore.RESET
            )
        else:
            print(Fore.GREEN + f"> Creating thumbnail for {filename}..." + Fore.RESET)
            vips_image = pyvips.Image.new_from_file(filename, access="sequential")
            # Target size of 600px, as in the vipsthumbnail invocation above
            # NOTE(review): 600 is passed as the first positional argument of
            # thumbnail_image(); confirm whether that bounds width or height.
            vips_thumbnail = vips_image.thumbnail_image(600)
            vips_thumbnail.jpegsave(thumbnail, Q=85, optimize_coding=True, strip=True)

    return


def download_bitstream(row):
    """Download every URL of the row to its corresponding filename.

    URLs and filenames are "||"-separated lists that are paired by position.
    Files that already exist are not downloaded again. With --dry-run nothing
    is downloaded, only reported.
    """
    request_headers = {"user-agent": "CGSpace PDF bot"}

    # some records have multiple URLs separated by "||"
    urls = row[args.url_field_name].split("||")
    filenames = row[args.filename_field_name].split("||")

    # zip() stops at the shorter list, so say so instead of silently ignoring
    # the extra values.
    if len(urls) != len(filenames):
        print(
            Fore.RED
            + f"> Found {len(urls)} URLs but {len(filenames)} filenames, extra values are ignored."
            + Fore.RESET
        )

    for url, filename in zip(urls, filenames):
        if args.debug:
            print(f"URL: {url}")
            print(f"File: {filename}")

        # check if file exists
        if os.path.isfile(filename):
            if args.debug:
                print(Fore.YELLOW + f"> {filename} already downloaded." + Fore.RESET)

            continue

        if args.dry_run:
            print(
                Fore.GREEN
                + f"> (DRY RUN) Would download {filename} from {url}."
                + Fore.RESET
            )

            continue

        if args.debug:
            print(Fore.GREEN + f"> Downloading {filename}..." + Fore.RESET)

        # The context manager releases the streamed connection even when the
        # body is never read (failed download) or writing the file raises.
        with requests.get(url, headers=request_headers, stream=True) as response:
            if response.status_code == 200:
                with open(filename, "wb") as fd:
                    for chunk in response:
                        fd.write(chunk)
            else:
                print(
                    Fore.RED
                    + f"> Download failed (HTTP {response.status_code}), I will try again next time."
                    + Fore.RESET
                )

    return


if __name__ == "__main__":
    # set the signal handler for SIGINT (^C)
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(
        description="Download PDFs and generate thumbnails from files in a CSV."
    )
    parser.add_argument(
        "-i",
        "--csv-file",
        help="Path to CSV file",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages to standard error (stderr).",
        action="store_true",
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        help="Only print changes that would be made.",
        action="store_true",
    )
    parser.add_argument(
        "-f",
        "--filename-field-name",
        help="Name of column with thumbnail filenames.",
        default="filename",
    )
    parser.add_argument(
        "-u",
        "--url-field-name",
        help="Name of column with URLs for the PDFs.",
        default="dc.description.url",
    )
    parser.add_argument(
        "-w", "--download-only", help="Only download the PDFs.", action="store_true"
    )
    args = parser.parse_args()

    # open the CSV
    reader = csv.DictReader(args.csv_file)

    # check if the filename and URL fields specified by the user exist in the
    # CSV (fieldnames is None when the file has no header row)
    csv_fieldnames = reader.fieldnames or []
    for field_name in (args.filename_field_name, args.url_field_name):
        if field_name not in csv_fieldnames:
            sys.stderr.write(
                Fore.RED
                + f'Specified field "{field_name}" does not exist in the CSV.\n'
                + Fore.RESET
            )
            sys.exit(1)

    for row in reader:
        download_bitstream(row)

        if not args.download_only:
            create_thumbnail(row)
def download_bitstreams(pdf_bitstream_ids):
    """Download the given bitstreams from the DSpace REST API.

    For each bitstream a HEAD request is made first to learn the filename from
    the Content-Disposition header. The file is then fetched with a streamed
    GET unless a file with that name already exists in the current directory.

    :param pdf_bitstream_ids: iterable of bitstream UUIDs.
    """
    import re

    request_headers = {"user-agent": rest_user_agent}

    for pdf_bitstream_id in pdf_bitstream_ids:
        url = f"{rest_base_url}/{rest_bitstream_endpoint}/{pdf_bitstream_id}/retrieve"

        # do a HEAD request first to get the filename from the content disposition header
        # See: https://stackoverflow.com/questions/31804799/how-to-get-pdf-filename-with-python-requests
        response = requests.head(url, headers=request_headers)

        if response.status_code != 200:
            continue

        # The header may be absent or may not carry a filename. Skip the
        # bitstream instead of crashing with KeyError/IndexError.
        content_disposition = response.headers.get("content-disposition", "")
        filename_match = re.search("filename=(.+)", content_disposition)
        if filename_match is None:
            logger.error(
                Fore.RED
                + f"> No filename in response headers for bitstream {pdf_bitstream_id}, skipping."
                + Fore.RESET
            )
            continue

        # filenames in the header have quotes so let's strip them. The name is
        # supplied by the server, so keep only the final path component to make
        # sure we never write outside the current directory.
        filename_stripped = os.path.basename(filename_match.group(1).strip('"'))
        if not filename_stripped:
            logger.error(
                Fore.RED
                + f"> Empty filename for bitstream {pdf_bitstream_id}, skipping."
                + Fore.RESET
            )
            continue

        logger.debug(f"> filename: {filename_stripped}")

        # check if file exists
        if os.path.isfile(filename_stripped):
            logger.debug(
                Fore.YELLOW
                + "> {} already downloaded.".format(filename_stripped)
                + Fore.RESET
            )
            continue

        logger.info(
            Fore.GREEN + "> Downloading {}...".format(filename_stripped) + Fore.RESET
        )

        # The context manager releases the streamed connection in every case.
        with requests.get(url, headers=request_headers, stream=True) as response:
            if response.status_code == 200:
                with open(filename_stripped, "wb") as fd:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        fd.write(chunk)
            else:
                logger.error(
                    Fore.RED
                    + "> Download failed, I will try again next time."
                    + Fore.RESET
                )
def download_pdf(doi):
    """Try to fetch the PDF for one DOI from Sci-Hub.

    The target path is the DOI with "/" replaced by "-" plus a ".pdf" suffix,
    inside ``args.output_directory``. Nothing is downloaded when that file is
    already present.
    """
    logger.info(f"Processing {doi}")

    pdf_path = os.path.join(args.output_directory, doi.replace("/", "-") + ".pdf")

    # nothing to do if we already have this one
    if os.path.isfile(pdf_path):
        logger.debug(Fore.GREEN + f"> {pdf_path} already downloaded." + Fore.RESET)
        return

    logger.debug(Fore.GREEN + f"> Attempting to download PDF for {doi}" + Fore.RESET)
    scihub_download(doi, paper_type="doi", out=pdf_path)

    # scihub_download gives us nothing to check, so look for the file on disk
    if not os.path.isfile(pdf_path):
        logger.debug(Fore.RED + "> Download unsuccessful." + Fore.RESET)
        return

    logger.info(Fore.YELLOW + f"> Successfully saved to: {pdf_path}" + Fore.RESET)
def resolve_doi(doi: str) -> None:
    """Look up a DOI in Unpaywall and try to download its full-text PDF.

    The PDF is saved in ``args.output_directory`` under a name derived from
    the DOI. Each open-access location that has a PDF URL is tried in order
    until one download succeeds.

    :param doi: the DOI to look up, e.g. ``10.3402/iee.v6.31191``.
    """
    logger.info(f"Looking up DOI: {doi}")

    # Set filename based on DOI so we can check whether it has already been
    # downloaded, ie: 10.3402/iee.v6.31191 → 10.3402-iee.v6.31191.pdf
    pdf_filename = doi.replace("/", "-") + ".pdf"
    pdf_file_path = os.path.join(args.output_directory, pdf_filename)

    # Check if file exists already so we can return early if so
    if os.path.isfile(pdf_file_path):
        logger.debug(Fore.GREEN + f"> {pdf_file_path} already downloaded." + Fore.RESET)

        return

    # Fetch the metadata for this DOI
    request_url = f"https://api.unpaywall.org/v2/{doi}"
    request_params = {"email": args.email}

    try:
        request = requests.get(request_url, params=request_params)
    except requests.exceptions.ConnectionError:
        logger.error(Fore.RED + "Connection error." + Fore.RESET)

        # I guess we have to exit
        sys.exit(1)

    # Fail early if the DOI is not found in Unpaywall
    if not request.ok:
        logger.debug(f"> DOI not in Unpaywall (cached: {request.from_cache})")

        return

    logger.debug(f"> DOI in Unpaywall (cached: {request.from_cache})")

    data = request.json()

    for oa_location in data.get("oa_locations") or []:
        # Some locations have no PDF URL at all and sometimes the value is
        # blank. Either way, move on to the next source.
        url_for_pdf = oa_location.get("url_for_pdf")
        if not url_for_pdf:
            continue

        logger.info(
            Fore.YELLOW + f"> Attempting to download: {url_for_pdf}" + Fore.RESET
        )

        # Try to download the file from this OA location. Failures are
        # best-effort, but only catch Exception: a bare except would also
        # swallow the SystemExit raised by our SIGINT handler, making it
        # impossible to interrupt the run with ^C.
        try:
            downloaded = util.download_file(url_for_pdf, pdf_file_path)
        except Exception as error:
            logger.debug(Fore.RED + f"> Download failed: {error}" + Fore.RESET)
            continue

        if downloaded:
            logger.info(
                Fore.YELLOW + f"> Successfully saved to: {pdf_file_path}" + Fore.RESET
            )

            # We have the file, no need to look at the remaining locations
            break

        # I guess this OA location is stale
        logger.debug(Fore.RED + "> Download unsuccessful." + Fore.RESET)
+if args.debug: + logger.setLevel(logging.DEBUG) + logging.basicConfig(format="[D] %(message)s") +else: + logger.setLevel(logging.INFO) + logging.basicConfig(format="[I] %(message)s") + +# Install a transparent request cache +expire_after = timedelta(days=30) +requests_cache.install_cache( + "requests-cache", expire_after=expire_after, allowable_codes=(200, 404) +) +requests_cache.delete() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the DOIs from there +if args.input_file: + dois = util.read_dois_from_file(args.input_file) + for doi in dois: + resolve_doi(doi) diff --git a/ilri/iso3166_lookup.py b/ilri/iso3166_lookup.py new file mode 100755 index 000000000000..356edfebc157 --- /dev/null +++ b/ilri/iso3166_lookup.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# +# iso3166-lookup.py 0.0.1 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the ISO 3166 dataset for countries read from a text file. Text file +# should have one organization per line. Results are saved to a CSV including +# the country name, whether it matched or not, and the type of match. 
# read countries from a text file, one per line
def read_countries_from_file():
    """Read the unique country names from ``args.input_file`` and resolve them.

    Lines are stripped of surrounding whitespace, blank lines are ignored, and
    duplicates are dropped while keeping first-seen order. The input file is
    closed before the names are handed to ``resolve_countries``.
    """
    countries = []
    # track what we've already collected so the membership test stays O(1)
    seen = set()

    for line in args.input_file:
        # trim any leading or trailing whitespace (including newlines)
        country = line.strip()

        # skip blank lines and countries that are already present
        if not country or country in seen:
            continue

        seen.add(country)
        countries.append(country)

    # close input file before we exit
    args.input_file.close()

    resolve_countries(countries)
def signal_handler(signal, frame):
    """Close the output file and exit with status 1 on SIGINT (^C)."""
    # close output file before we exit
    args.output_file.close()

    sys.exit(1)


parser = argparse.ArgumentParser(
    description="Query ISO 3166-1 to validate countries from a text file and save results in a CSV."
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="File name containing countries to look up in ISO 3166-1 and ISO 3166-3.",
    required=True,
    type=argparse.FileType("r"),
)
parser.add_argument(
    "-o",
    "--output-file",
    help="Name of output file to write results to (CSV).",
    required=True,
    type=argparse.FileType("w", encoding="UTF-8"),
)
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# Lower-cased names to match against. These are only ever used for membership
# tests, so keep them in sets.
country_names = set()
country_official_names = set()
country_common_names = set()

# Collect names from ISO 3166-1 and from the historic countries in ISO 3166-3.
# Some countries don't have official_name or common_name and accessing them
# raises AttributeError, which getattr() with a default absorbs.
for dataset in (pycountry.countries, pycountry.historic_countries):
    for country in dataset:
        country_names.add(country.name.lower())

        official_name = getattr(country, "official_name", None)
        if official_name:
            country_official_names.add(official_name.lower())

        common_name = getattr(country, "common_name", None)
        if common_name:
            country_common_names.add(common_name.lower())

read_countries_from_file()

# the exit() builtin is provided by the site module and is not guaranteed to
# exist, so use sys.exit() instead
sys.exit()
+mapfile -t fields_to_move < 0: + if args.dry_run: + if not args.quiet: + print( + f"{Fore.GREEN}Would move {cursor.rowcount} occurences of: {line}{Fore.RESET}" + ) + + # Since this a dry run we can continue to the next line + continue + + # Get the records for items with matching metadata. We will use the + # object IDs to update their last_modified dates. + matching_records = cursor.fetchall() + + sql = "UPDATE metadatavalue SET metadata_field_id=%s WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s" + cursor.execute( + sql, + ( + to_field_id, + from_field_id, + line, + ), + ) + + if cursor.rowcount > 0: + if not args.quiet: + print( + f"{Fore.GREEN}Moved {cursor.rowcount} occurences of: {line}{Fore.RESET}" + ) + + # Update the last_modified date for each item we've changed + for record in matching_records: + util.update_item_last_modified(cursor, record[0]) + +# close database connection before we exit +conn.close() + +# close input file +args.input_file.close() + +sys.exit(0) diff --git a/ilri/orcid_authority_to_item.py b/ilri/orcid_authority_to_item.py new file mode 100755 index 000000000000..a2901beb223b --- /dev/null +++ b/ilri/orcid_authority_to_item.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +# +# orcid-authority-to-item.py 1.1.1 +# +# Copyright Alan Orth. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
def main():
    """Parse the command line arguments and start the mapping run."""
    parser = argparse.ArgumentParser(
        description="Map ORCID identifiers from the DSpace Solr authority core to cg.creator.id fields in each item."
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages to standard error (stderr).",
        action="store_true",
    )
    parser.add_argument("-db", "--database-name", help="Database name", required=True)
    parser.add_argument(
        "-u", "--database-user", help="Database username", required=True
    )
    parser.add_argument(
        "-p", "--database-pass", help="Database password", required=True
    )
    parser.add_argument(
        "-s",
        "--solr-url",
        help="URL of Solr application",
        default="http://localhost:8080/solr",
    )
    args = parser.parse_args()

    # set the signal handler for SIGINT (^C) so we can exit cleanly
    signal.signal(signal.SIGINT, signal_handler)

    # get all ORCID identifiers from Solr authority core
    read_identifiers_from_solr(args)


# query DSpace's authority Solr core for authority IDs with ORCID identifiers
def read_identifiers_from_solr(args):
    """Collect authority ID → ORCID iD pairs from Solr and process them.

    :param args: parsed command line arguments (uses ``solr_url``, ``debug``).
    """
    # simple query from the 'authority' collection, at most 2000 rows
    solr_query_params = {"q": "orcid_id:*", "wt": "json", "rows": 2000}

    solr_url = args.solr_url + "/authority/select"

    res = requests.get(solr_url, params=solr_query_params)

    # parse the response body once and reuse it
    response = res.json()["response"]
    num_found = response["numFound"]
    docs = response["docs"]

    if args.debug:
        sys.stderr.write(
            Fore.GREEN
            + f"Total number of Solr records with ORCID iDs: {num_found}"
            + Fore.RESET
            + "\n"
        )

    # We only ask for one page of results, so say so if Solr has more records
    # than it returned rather than silently processing a subset.
    if num_found > len(docs):
        sys.stderr.write(
            Fore.YELLOW
            + f"Warning: Solr has {num_found} matching records but returned only {len(docs)}.\n"
            + Fore.RESET
        )

    # initialize an empty dictionary for authorities
    # format will be: {'d7ef744b-bbd4-4171-b449-00e37e1b776f': '0000-0002-3476-272X', ...}
    authorities = {}

    # keep the first ORCID iD seen for each authority ID
    for doc in docs:
        authorities.setdefault(doc["id"], doc["orcid_id"])

    add_orcid_identifiers(args, authorities)


# Query ORCID's public API for names associated with an identifier. Prefers to use
# the "credit-name" field if it is present, otherwise will default to using the
# "given-names" and "family-name" fields.
def resolve_orcid_identifier(args, orcid):
    """Return the name registered for an ORCID iD.

    :param args: parsed command line arguments (uses ``debug``).
    :param orcid: the ORCID identifier to look up.
    :returns: the name as a string, or ``None`` when the identifier does not
        exist (HTTP 404) or has no usable name. Exits the program on any other
        HTTP error.
    """
    # ORCID API endpoint, see: https://pub.orcid.org
    orcid_api_base_url = "https://pub.orcid.org/v2.1/"
    orcid_api_endpoint = "/person"

    # fetch names associated with an ORCID identifier from the ORCID API
    if args.debug:
        sys.stderr.write(
            Fore.GREEN
            + "Looking up the names associated with ORCID iD: {0}\n".format(orcid)
            + Fore.RESET
        )

    # enable transparent request cache with thirty-day expiry
    expire_after = timedelta(days=30)

    # cache HTTP 200 and 404 responses, because ORCID uses HTTP 404 when an identifier doesn't exist
    requests_cache.install_cache(
        "requests-cache", expire_after=expire_after, allowable_codes=(200, 404)
    )

    # build request URL for current ORCID ID
    request_url = orcid_api_base_url + orcid.strip() + orcid_api_endpoint

    # ORCID's API defaults to some custom format, so tell it to give us JSON
    request = requests.get(request_url, headers={"Accept": "application/json"})

    # prune old cache entries
    requests_cache.delete()

    # HTTP 404 means that the API url or identifier was not found. If the
    # API URL is correct, let's assume that the identifier was not found.
    if request.status_code == 404:
        if args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + "Warning: skipping missing identifier (API request returned HTTP 404).\n\n"
                + Fore.RESET
            )
        return None

    if request.status_code != requests.codes.ok:
        sys.stderr.write(Fore.RED + "Error: request failed.\n" + Fore.RESET)
        sys.exit(1)

    name = request.json()["name"]

    # make sure name element is not null
    if not name:
        if args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + "Warning: skipping identifier with null name element.\n\n"
                + Fore.RESET
            )
        return None

    # prefer credit-name if present and not blank
    credit_name = name.get("credit-name")
    if credit_name and credit_name["value"] != "":
        return credit_name["value"]

    # otherwise use given-names + family-name, as long as given-names is not
    # null. Previously this case fell through to an unbound variable.
    given_names = name.get("given-names")
    if not given_names:
        if args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + "Warning: skipping identifier with null given-names element.\n\n"
                + Fore.RESET
            )
        return None

    line = given_names["value"]

    # make sure family-name is not null
    family_name = name.get("family-name")
    if family_name:
        line = line + " " + family_name["value"]
    elif args.debug:
        sys.stderr.write(
            Fore.YELLOW + "Warning: ignoring null family-name element.\n" + Fore.RESET
        )

    return line


def add_orcid_identifiers(args, authorities):
    """Add a cg.creator.id value to every item that uses a known authority.

    :param args: parsed command line arguments (uses the ``database_*``
        options and ``debug``).
    :param authorities: dict mapping authority IDs to ORCID iDs.
    """
    # Connect with keyword arguments rather than a hand-built DSN string so
    # that credentials containing spaces or quotes are handled correctly.
    try:
        conn = psycopg2.connect(
            dbname=args.database_name,
            user=args.database_user,
            password=args.database_pass,
            host="localhost",
        )

        if args.debug:
            sys.stderr.write(Fore.GREEN + "Connected to the database.\n" + Fore.RESET)
    except psycopg2.OperationalError:
        sys.stderr.write(Fore.RED + "Unable to connect to the database.\n" + Fore.RESET)
        sys.exit(1)

    # get the metadata_field_id for cg.creator.id field. It is the same for
    # every record, so look it up once.
    with conn:
        with conn.cursor() as cursor:
            sql = "SELECT metadata_field_id FROM metadatafieldregistry WHERE metadata_schema_id=2 AND element='creator' AND qualifier='id'"
            cursor.execute(sql)
            field_row = cursor.fetchone()

    if field_row is None:
        sys.stderr.write(
            Fore.RED
            + "Error: the cg.creator.id field is not in the metadata registry.\n"
            + Fore.RESET
        )
        conn.close()
        sys.exit(1)

    metadata_field_id = field_row[0]

    # iterate over all authorities
    for authority_id, orcid in authorities.items():
        # get name associated with this orcid identifier
        name = resolve_orcid_identifier(args, orcid)

        # no usable name for this identifier, nothing to add
        if name is None:
            continue

        creator = "{0}: {1}".format(name, orcid)

        if args.debug:
            sys.stderr.write(
                Fore.GREEN
                + "Processing authority ID {0} with ORCID iD: {1}\n".format(
                    authority_id, orcid
                )
                + Fore.RESET
            )

        with conn:
            # cursor will be closed after this block exits
            # see: http://initd.org/psycopg/docs/usage.html#with-statement
            with conn.cursor() as cursor:
                # find all metadata records with this authority id
                # resource_type_id 2 is item metadata, metadata_field_id 3 is author
                sql = "SELECT resource_id, place FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=3 AND authority=%s"
                # remember that tuples with one item need a comma after them!
                cursor.execute(sql, (authority_id,))
                records_with_authority = cursor.fetchall()

                if args.debug:
                    sys.stderr.write(
                        Fore.GREEN
                        + "Checking {0} items for authority ID {1}.\n".format(
                            len(records_with_authority), authority_id
                        )
                        + Fore.RESET
                    )

                # iterate over results for current authority_id to add cg.creator.id metadata
                for resource_id, place in records_with_authority:
                    # author name and orcid identifier
                    text_value = creator
                    confidence = -1

                    # first, check if there is an existing cg.creator.id here (perhaps the script crashed before?)
                    # resource_type_id 2 is item metadata
                    sql = "SELECT * from metadatavalue WHERE resource_id=%s AND metadata_field_id=%s AND text_value=%s AND place=%s AND confidence=%s AND resource_type_id=2"
                    cursor.execute(
                        sql,
                        (
                            resource_id,
                            metadata_field_id,
                            text_value,
                            place,
                            confidence,
                        ),
                    )
                    records_with_orcid = cursor.fetchall()

                    if records_with_orcid:
                        if args.debug:
                            sys.stderr.write(
                                Fore.GREEN
                                + "Item {0} already has an ORCID identifier for {1}.\n".format(
                                    resource_id, creator
                                )
                                + Fore.RESET
                            )
                        continue

                    print(
                        "Adding ORCID identifier to item {0}: {1}".format(
                            resource_id, creator
                        )
                    )

                    # metadatavalue IDs come from a PostgreSQL sequence that increments when you call it
                    cursor.execute("SELECT nextval('metadatavalue_seq')")
                    metadata_value_id = cursor.fetchone()[0]

                    sql = "INSERT INTO metadatavalue (metadata_value_id, resource_id, metadata_field_id, text_value, place, confidence, resource_type_id) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                    cursor.execute(
                        sql,
                        (
                            metadata_value_id,
                            resource_id,
                            metadata_field_id,
                            text_value,
                            place,
                            confidence,
                            2,
                        ),
                    )

    if args.debug:
        sys.stderr.write(Fore.GREEN + "Disconnecting from database.\n" + Fore.RESET)

    # close the database connection before leaving
    conn.close()


def signal_handler(signal, frame):
    """Exit with status 1 on SIGINT (^C)."""
    sys.exit(1)


if __name__ == "__main__":
    main()
def choose_country_name(country: dict):
    """Pick the most convenient display name for an ISO 3166-1 entry.

    The common name wins whenever the entry has one. Otherwise the shorter of
    ``name`` and ``official_name`` is used; when they are equally long the
    official name is returned. A missing or empty value is ignored in favour
    of the other one.

    :param country: one entry from the iso-codes JSON.
    :returns: the chosen name.
    :raises KeyError: if the entry has no usable ``name`` or ``official_name``.
    """
    # Prefer the common name if it exists!
    if "common_name" in country:
        return country["common_name"]

    # official_name goes first so that min() returns it on a tie
    candidates = [
        value
        for value in (country.get("official_name"), country.get("name"))
        if value
    ]

    if not candidates:
        raise KeyError("country has neither 'name' nor 'official_name'")

    return min(candidates, key=len)
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="Path to iso_3166-1.json from Debian iso-codes package.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write results to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# read the list of countries +countries_json = json.load(args.input_file) + +for country in countries_json["3166-1"]: + country_name = choose_country_name(country) + + args.output_file.write(f"{country_name}\n") + +args.input_file.close() +args.output_file.close() + +exit() diff --git a/ilri/post_bitstreams.py b/ilri/post_bitstreams.py new file mode 100755 index 000000000000..541f25d0f194 --- /dev/null +++ b/ilri/post_bitstreams.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python3 +# +# post_bitstreams.py 0.1.3 +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# A script to read item IDs and filenames from a CSV file and update existing +# items in a DSpace repository via the REST API. Specify an email for a DSpace +# user with administrator privileges when running: +# +# $ ./post_bitsreams.py -i items.csv -e me@example.com -p 'fuu!' +# +# The CSV input file should have DSpace item IDs, filenames, and bundle names, +# for example: +# +# id,filename,bundle +# 804351af-64eb-4e4a-968f-4d3be61358a8,file1.pdf__description:Report,ORIGINAL +# 82b8c92c-fd6e-4b30-a704-5fbdc1cc6d1c,file2.pdf__description:Journal Article,ORIGINAL +# 82b8c92c-fd6e-4b30-a704-5fbdc1cc6d1c,thumbnail.png__description:libvips thumbnail,THUMBNAIL +# +# Optionally specify the bitstream description using the SAFBuilder syntax. 
def login(user: str, password: str):
    """Log into the DSpace REST API.

    Equivalent to the following request with httpie or curl:

        $ http -f POST http://localhost:8080/rest/login email=aorth@fuuu.com password='fuuuuuu'

    The process exits with status 1 if the API cannot be reached, if the
    response status is not 200 OK, or if the response carries no JSESSIONID
    cookie.

    :param user: email of user with permissions to update the item (should probably be an admin).
    :param password: password of user.
    :returns: JSESSIONID value for the session.
    """

    request_url = rest_login_endpoint
    headers = {"user-agent": user_agent}
    # Use the credentials we were called with instead of reaching back into
    # the global argparse namespace, so the parameters are not decorative.
    data = {"email": user, "password": password}

    logger.info("Logging in...")

    try:
        request = requests.post(request_url, headers=headers, data=data)
    except requests.ConnectionError:
        # Report the URL we actually tried. The argparse namespace has no
        # "request_url" attribute, so referencing it here would raise an
        # AttributeError and hide the real connection problem.
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code != requests.codes.ok:
        logger.error(Fore.RED + "> Login failed." + Fore.RESET)

        sys.exit(1)

    try:
        jsessionid = request.cookies["JSESSIONID"]
    except KeyError:
        logger.error(
            Fore.RED
            + f"Login failed (HTTP {request.status_code}): missing JESSSIONID cookie in response...?"
            + Fore.RESET
        )

        sys.exit(1)

    logger.debug(
        Fore.GREEN + f"Login successful, new JSESSIONID: {jsessionid}" + Fore.RESET
    )

    return jsessionid
+ :returns: bool + """ + + request_url = f"{rest_items_endpoint}/{item_id}" + headers = {"user-agent": user_agent} + # Not strictly needed here for permissions, but let's give the session ID + # so that we don't allocate unecessary resources on the server. + cookies = {"JSESSIONID": jsessionid} + request_params = {"expand": "bitstreams,metadata"} + + try: + request = requests.get( + request_url, headers=headers, cookies=cookies, params=request_params + ) + except requests.ConnectionError: + logger.error( + Fore.RED + + f"> Could not connect to REST API: {args.request_url}" + + Fore.RESET + ) + + sys.exit(1) + + # If the item doesn't exist, return True early so we don't try to upload a + # bitstream + if request.status_code == 404: + logger.warning(Fore.RED + "Item not found." + Fore.RESET) + return True + + if request.status_code == requests.codes.ok: + data = request.json() + + # List comprehension to filter out bitstreams that belong to the bundle + # we're interested in + bitstreams_in_bundle = [ + bitstream + for bitstream in data["bitstreams"] + if bitstream["bundleName"] == bundle + ] + + if len(bitstreams_in_bundle) == 0: + # Return False, meaning the item does not have a bitstream in this bundle yet + return False + + # We have bitstreams, so let's see if the user wants to overwrite them + if args.overwrite_format: + bitstreams_to_overwrite = [ + bitstream + for bitstream in bitstreams_in_bundle + if bitstream["format"] in args.overwrite_format + ] + + # Item has bitstreams, but none matching our overwrite format. Let's + # err on the side of caution and return True so that we don't upload + # another one into the bundle. + if len(bitstreams_to_overwrite) == 0: + logger.debug( + "Existing bitstreams, but none matching our overwrite formats." 
def delete_bitstream(bitstream_id: str):
    """Delete a bitstream.

    Equivalent to the following request with httpie or curl:

        $ http DELETE 'http://localhost:8080/rest/bitstreams/fca0fd2a-630e-4a34-b260-f645c8f2b027' \
                Cookie:JSESSIONID=B3B9C82F257BCE1773E6FB1EA5ACD774

    The process exits with status 1 if the REST API cannot be reached.

    :param bitstream_id: uuid of bitstream in the DSpace repository.
    :returns: True if the API answered 200 OK, False otherwise.
    """

    request_url = f"{rest_bitstreams_endpoint}/{bitstream_id}"
    headers = {"user-agent": user_agent}
    cookies = {"JSESSIONID": jsessionid}

    try:
        request = requests.delete(request_url, headers=headers, cookies=cookies)
    except requests.ConnectionError:
        # Report the URL we actually tried. The argparse namespace has no
        # "request_url" attribute, so referencing it here would raise an
        # AttributeError and hide the real connection problem.
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code == requests.codes.ok:
        return True

    # Leave a trace of the failure; the caller only reports successes.
    logger.error(
        Fore.RED
        + f"> Could not delete bitstream {bitstream_id} (HTTP {request.status_code})"
        + Fore.RESET
    )

    return False
if __name__ == "__main__":
    # Set the signal handler for SIGINT (^C)
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(
        description="Post bitstreams to existing items in a DSpace 6.x repository."
    )
    parser.add_argument(
        "-d", "--debug", help="Print debug messages.", action="store_true"
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        help="Only print changes that would be made.",
        action="store_true",
    )
    parser.add_argument(
        "-u",
        "--rest-url",
        help="URL of the DSpace 6.x REST API.",
        default="http://localhost:8080/rest",
    )
    parser.add_argument("-e", "--user", help="Email address of administrator user.")
    parser.add_argument(
        "--overwrite-format",
        help="Bitstream formats to overwrite. Specify multiple formats separated by a space. Use this carefully, test with dry run first!",
        choices=["PNG", "JPEG", "GIF", "Adobe PDF", "WebP"],
        action="extend",
        nargs="+",
    )
    parser.add_argument("-p", "--password", help="Password of administrator user.")
    parser.add_argument(
        "-i",
        "--csv-file",
        help="Path to CSV file",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "-s", "--jsessionid", help="JESSIONID, if previously authenticated."
    )
    args = parser.parse_args()

    # The default log level is WARNING, but we want to set it to DEBUG or INFO
    logger.setLevel(logging.DEBUG if args.debug else logging.INFO)

    # Since we're running interactively we can set the preferred log format for
    # the logging module during this invocation.
    logging.basicConfig(format="[%(levelname)s] %(message)s")

    # DSpace 6.x REST API base URL and endpoints. These are module-level names
    # that the request helpers in this file read as globals.
    rest_base_url = args.rest_url
    rest_login_endpoint = f"{rest_base_url}/login"
    rest_status_endpoint = f"{rest_base_url}/status"
    rest_items_endpoint = f"{rest_base_url}/items"
    rest_bitstreams_endpoint = f"{rest_base_url}/bitstreams"
    user_agent = "Alan Orth (ILRI) Python bot"

    # Reuse the session ID the user passed if it is still valid, otherwise
    # login and get a new session.
    if args.jsessionid and check_session(args.jsessionid):
        jsessionid = args.jsessionid
    else:
        jsessionid = login(args.user, args.password)

    # argparse has already opened the CSV for us (and would have exited if the
    # path did not exist), so there is no FileNotFoundError to catch here.
    reader = csv.DictReader(args.csv_file)

    logger.debug(f"Opened {args.csv_file.name}")

    # An empty CSV has no header row at all, in which case fieldnames is None
    fieldnames = reader.fieldnames or []

    # Check if the required fields exist in the CSV
    for field in ["id", "filename", "bundle"]:
        if field not in fieldnames:
            logger.error(
                Fore.RED
                + f"Expected field {field} does not exist in the CSV."
                + Fore.RESET
            )

            sys.exit(1)

    for row in reader:
        item_id = row["id"]
        bundle = row["bundle"]

        # The filename column optionally carries a bitstream description in
        # SAFBuilder syntax: "file.pdf__description:Some text"
        filename_parts = row["filename"].split("__description:")
        filename = filename_parts[0]
        description = filename_parts[1] if len(filename_parts) > 1 else False

        # Make sure the local file exists BEFORE asking check_item() about the
        # bundle: with --overwrite-format check_item() deletes the existing
        # bitstreams, and we must not delete anything we cannot replace.
        if not os.path.isfile(filename):
            logger.info(
                f"{Fore.YELLOW}> File not found, skipping: {filename}{Fore.RESET}"
            )

            continue

        # Check if this item already has a bitstream in this bundle (check_item
        # returns True if the bundle already has a bitstream).
        logger.info(f"{item_id}: checking for existing bitstreams in {bundle} bundle")

        if check_item(item_id, bundle):
            continue

        if args.dry_run:
            logger.info(
                f"{Fore.YELLOW}> (DRY RUN) Uploading file: {filename}{Fore.RESET}"
            )
        elif upload_file(item_id, bundle, filename, description):
            logger.info(
                f"{Fore.YELLOW}> Uploaded file: {filename} ({bundle}){Fore.RESET}"
            )
GPL-3.0-only +# +# --- +# +# A script to read item IDs and URLs from a CSV file and update existing items +# in a DSpace repository via the REST API. Developed when we had a corporate +# website with thousands of PDFs go offline and wanted to upload the PDFs to +# their existing metadata-only accessions in our respository. Specify an email +# and for a DSpace user with administrator privileges when running: +# +# $ ./post-ciat-pdfs.py -i items.csv -e me@example.com -p 'fuu!' +# +# The CSV input file should have DSpace item IDs (UUID) and URLs, ie: +# +# id,url +# 804351af-64eb-4e4a-968f-4d3be61358a8,http://example.com/library/file1.pdf +# 82b8c92c-fd6e-4b30-a704-5fbdc1cc6d1c,http://example.com/library/file2.pdf +# +# You can optionally specify the URL of a DSpace REST application (default is to +# use http://localhost:8080/rest). If your CSV file has a large number of URLs +# to download you can run it first in download-only mode with the "-w" option. +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install colorama requests +# + +import argparse +import csv +import os.path +import signal +import sys +from urllib.parse import unquote, urlparse + +import requests +from colorama import Fore + + +def signal_handler(signal, frame): + sys.exit(1) + + +def login(user: str, password: str): + """Log into the DSpace REST API. + + Equivalent to the following request with httpie or curl: + + $ http -f POST http://localhost:8080/rest/login email=aorth@fuuu.com password='fuuuuuu' + + :param user: email of user with permissions to update the item (should probably be an admin). + :param password: password of user. + :returns: JSESSION value for the session. 
def url_to_filename(url: str):
    """Derive a local filename from a URL.

    Only the path component of the URL is considered. Everything after the
    last slash of that path is taken and then percent-decoded with unquote(),
    so "file%20name.pdf" becomes "file name.pdf".

    :param url: URL of a PDF file to download, for example "https://example.com/docs/file.pdf"
    :returns: filename, for example "file.pdf"
    """

    url_path = urlparse(url).path

    # rpartition yields the whole string as the tail when there is no slash
    _, _, last_segment = url_path.rpartition("/")

    return unquote(last_segment)
def upload_file(item_id: str, filename: str, item_type: str):
    """Upload a file to an existing item in the DSpace repository.

    Equivalent to the following request with httpie or curl:

        http POST \
        'http://localhost:8080/rest/items/21c0db9d-6c35-4111-9ca1-2c1345f44e40/bitstreams?name=file.pdf&description=Book' \
        Cookie:JSESSIONID=0BDB219712F4F7DDB6055C1906F3E24B < file.pdf

    No bundle name is sent with the request.

    TODO: parameterize the bundle name so that we could upload a bunch of thumbnails.

    The process exits with status 1 if the REST API cannot be reached.

    :param item_id: UUID of item to post the file to.
    :param filename: Name of the file to upload (opened relative to the current working directory).
    :param item_type: Type of the item, to be used for the bitstream description.
    :returns: True if the API answered 200 OK, False if the file could not be
        opened or the upload was rejected.
    """

    request_url = f"{rest_items_endpoint}/{item_id}/bitstreams"
    headers = {"user-agent": user_agent}
    cookies = {"JSESSIONID": jsessionid}
    request_params = {"name": filename, "description": item_type}

    try:
        # The context manager closes the file on every path, including when
        # the request itself raises.
        with open(filename, "rb") as file:
            request = requests.post(
                request_url,
                headers=headers,
                cookies=cookies,
                params=request_params,
                files={"file": file},
            )
    except FileNotFoundError:
        sys.stderr.write(Fore.RED + f" Could not open {filename}\n" + Fore.RESET)

        # Nothing to upload, so stop here instead of carrying on without a
        # file handle.
        return False
    except requests.ConnectionError:
        sys.stderr.write(
            Fore.RED + f" Could not connect to REST API: {request_url}\n" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code == requests.codes.ok:
        return True

    print(Fore.RED + f" Error uploading file: {filename}" + Fore.RESET)

    return False
+) +parser.add_argument( + "-w", "--download-only", help="Only download the files.", action="store_true" +) +args = parser.parse_args() + +# DSpace 6.x REST API base URL and endpoints +rest_base_url = args.rest_url +rest_login_endpoint = f"{rest_base_url}/login" +rest_status_endpoint = f"{rest_base_url}/status" +rest_items_endpoint = f"{rest_base_url}/items" +user_agent = "Alan Orth (ILRI) Python bot" + +# Set the signal handler for SIGINT (^C) +signal.signal(signal.SIGINT, signal_handler) + +# If the user passed a session ID then we should check if it is valid first. +# Otherwise we should login and get a new session. If the user requested for +# download only mode then we skip authentication checks. +if args.jsessionid and not args.download_only: + if check_session(args.jsessionid): + jsessionid = args.jsessionid + else: + jsessionid = login(args.user, args.password) +elif not args.download_only: + jsessionid = login(args.user, args.password) + +if args.debug: + sys.stderr.write(f"Opening {args.csv_file.name}\n") + +try: + # Open the CSV + reader = csv.DictReader(args.csv_file) +except FileNotFoundError: + sys.stderr.write(Fore.RED + f" Could not open {args.csv_file.name}\n" + Fore.RESET) + +# Check if the item ID and URL fields exist in the CSV +for field in ["id", "url"]: + if field not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f"Expected field {field} does not exist in the CSV.\n" + + Fore.RESET + ) + sys.exit(1) + +for row in reader: + if download_file(row["url"]): + if not args.download_only: + check_item(row) diff --git a/ilri/resolve_addresses.py b/ilri/resolve_addresses.py new file mode 100755 index 000000000000..915bc3586fb7 --- /dev/null +++ b/ilri/resolve_addresses.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +# +# resolve-addresses.py 0.4.1 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the IPAPI.co API for information about IP addresses read from a text +# file. 
def resolve_addresses(addresses):
    """Look up each address in IPAPI (and optionally AbuseIPDB) and write CSV.

    One row per address is written to ``args.output_file`` with the columns
    ip, org, asn and country, plus abuseConfidenceScore when an AbuseIPDB API
    key was given. HTTP responses are cached on disk by requests-cache for
    thirty days. The output file is closed when all addresses are done.

    The process exits with status 1 as soon as an IPAPI request returns
    anything other than 200 OK (including the 429 rate limit response).

    :param addresses: iterable of IP address strings to resolve.
    """
    fieldnames = ["ip", "org", "asn", "country"]
    if args.abuseipdb_api_key:
        fieldnames.append("abuseConfidenceScore")

    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with thirty-day expiry
    requests_cache.install_cache("requests-cache", expire_after=timedelta(days=30))

    # prune old cache entries; delete() only removes what it is told to, so we
    # have to ask for the expired responses explicitly
    requests_cache.delete(expired=True)

    # iterate through our addresses
    for address in addresses:
        print(f"Looking up {address} in IPAPI")

        ipapi_response = requests.get(f"https://ipapi.co/{address}/json")

        if args.debug and ipapi_response.from_cache:
            sys.stderr.write(Fore.GREEN + "Request in cache.\n" + Fore.RESET)

        # check if we hit IPAPI's rate limit
        if ipapi_response.status_code == 429:
            sys.stderr.write(Fore.RED + "Error: hit IPAPI rate limit.\n" + Fore.RESET)
            sys.exit(1)

        # if request status not 200 OK
        if ipapi_response.status_code != requests.codes.ok:
            sys.stderr.write(
                Fore.RED
                + f"Error: request failed ({ipapi_response.status_code}).\n"
                + Fore.RESET
            )
            sys.exit(1)

        ipapi_data = ipapi_response.json()

        # Use .get() so that a 200 response without these keys produces empty
        # cells instead of a KeyError that aborts the whole run.
        # NOTE(review): presumably IPAPI answers this way for reserved or
        # private addresses; confirm against the API documentation.
        row = {
            "ip": address,
            "org": ipapi_data.get("org"),
            "asn": ipapi_data.get("asn"),
            "country": ipapi_data.get("country"),
        }

        if args.abuseipdb_api_key:
            print(f"→ Looking up {address} in AbuseIPDB")

            # build AbuseIPDB.com request for current address
            # see: https://docs.abuseipdb.com/#check-endpoint
            abuse_response = requests.get(
                "https://api.abuseipdb.com/api/v2/check",
                headers={"Key": args.abuseipdb_api_key},
                params={"ipAddress": address, "maxAgeInDays": 90},
            )

            if args.debug and abuse_response.from_cache:
                sys.stderr.write(Fore.GREEN + "→ Request in cache.\n" + Fore.RESET)

            # A failed AbuseIPDB lookup is not fatal: the score cell is simply
            # left empty for this address.
            if abuse_response.status_code == requests.codes.ok:
                abuseConfidenceScore = abuse_response.json()["data"][
                    "abuseConfidenceScore"
                ]

                print(f"→ {address} has score: {abuseConfidenceScore}")

                row["abuseConfidenceScore"] = abuseConfidenceScore

        writer.writerow(row)

    # close output file before we exit
    args.output_file.close()
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing IP addresses to resolve.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-k", + "--abuseipdb-api-key", + help="AbuseIPDB.com API key if you want to check whether IPs have been reported.", +) +parser.add_argument( + "-o", + "--output-file", + help="File name to save CSV output.", + required=True, + type=argparse.FileType("w"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +read_addresses_from_file() + +exit() diff --git a/ilri/resolve_addresses_geoip2.py b/ilri/resolve_addresses_geoip2.py new file mode 100755 index 000000000000..5f3cb038cc3b --- /dev/null +++ b/ilri/resolve_addresses_geoip2.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +# +# resolve-addresses-geoip2.py 0.0.2 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the local GeoIP DB for information about IP addresses read from a text +# file. The text file should have one address per line (comments and invalid li- +# nes are skipped). Consults GreyNoise to see if an IP address is known, and can +# optionally look up IPs in the AbuseIPDB.com if you provide an API key. 
def resolve_addresses(addresses):
    """Look up each address in the local GeoIP2 databases and write CSV.

    One row per address is written to ``args.output_file`` with the columns
    ip, org, network, asn, country and greyNoiseClassification, plus
    abuseConfidenceScore when an AbuseIPDB API key was given. IPv4 addresses
    are additionally looked up in GreyNoise. HTTP responses (200 and 404) are
    cached on disk by requests-cache for thirty days. The output file is
    closed when all addresses are done.

    Values that a database does not know for an address are left empty in the
    CSV.

    :param addresses: iterable of IP address strings to resolve.
    """
    fieldnames = [
        "ip",
        "org",
        "network",
        "asn",
        "country",
        "greyNoiseClassification",
    ]
    if args.abuseipdb_api_key:
        fieldnames.append("abuseConfidenceScore")

    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with thirty-day expiry
    requests_cache.install_cache(
        "requests-cache",
        expire_after=timedelta(days=30),
        allowable_codes=(200, 404),
    )

    # prune old cache entries; delete() only removes what it is told to, so we
    # have to ask for the expired responses explicitly
    requests_cache.delete(expired=True)

    # Open both databases once for the whole run rather than once per address
    with geoip2.database.Reader("/var/lib/GeoIP/GeoLite2-City.mmdb") as city_reader:
        with geoip2.database.Reader("/var/lib/GeoIP/GeoLite2-ASN.mmdb") as asn_reader:
            # iterate through our addresses
            for address in addresses:
                print(f"Looking up {address} in GeoIP2")

                # Start every address from a clean slate so that a failed
                # lookup neither crashes on an unbound name nor silently
                # reuses the previous address's values.
                address_country = None
                address_org = None
                address_net = None
                address_asn = None

                # Look up IP information in the City database
                try:
                    address_country = city_reader.city(address).country.iso_code
                except geoip2.errors.AddressNotFoundError:
                    pass

                # Look up organization information in the ASN database
                try:
                    asn_response = asn_reader.asn(address)

                    address_org = asn_response.autonomous_system_organization
                    address_net = asn_response.network
                    address_asn = asn_response.autonomous_system_number
                except geoip2.errors.AddressNotFoundError:
                    if args.debug:
                        sys.stderr.write(
                            Fore.YELLOW + "→ IP not in database.\n" + Fore.RESET
                        )

                row = {
                    "ip": address,
                    "org": address_org,
                    "network": address_net,
                    "asn": address_asn,
                    "country": address_country,
                }

                # Only look up IPv4 addresses in GreyNoise
                if isinstance(ipaddress.ip_address(address), ipaddress.IPv4Address):
                    print(f"→ Looking up {address} in GreyNoise")

                    # build greynoise.io request for current address
                    # see: https://docs.greynoise.io/reference/get_v3-community-ip
                    greynoise_response = requests.get(
                        f"https://api.greynoise.io/v3/community/{address}",
                        headers={"Accept": "application/json"},
                    )

                    if args.debug and greynoise_response.from_cache:
                        sys.stderr.write(
                            Fore.GREEN + "→ Request in cache.\n" + Fore.RESET
                        )

                    # if request status 200 OK
                    if greynoise_response.status_code == requests.codes.ok:
                        greyNoiseClassification = greynoise_response.json()[
                            "classification"
                        ]

                        print(
                            f"→ {address} has classification: {greyNoiseClassification}"
                        )
                    else:
                        # GreyNoise has not seen this address, so let's just say unknown
                        greyNoiseClassification = "unknown"

                    row["greyNoiseClassification"] = greyNoiseClassification

                if args.abuseipdb_api_key:
                    print(f"→ Looking up {address} in AbuseIPDB")

                    # build AbuseIPDB.com request for current address
                    # see: https://docs.abuseipdb.com/#check-endpoint
                    abuse_response = requests.get(
                        "https://api.abuseipdb.com/api/v2/check",
                        headers={"Key": args.abuseipdb_api_key},
                        params={"ipAddress": address, "maxAgeInDays": 90},
                    )

                    if args.debug and abuse_response.from_cache:
                        sys.stderr.write(
                            Fore.GREEN + "→ Request in cache.\n" + Fore.RESET
                        )

                    # A failed AbuseIPDB lookup is not fatal: the score cell is
                    # simply left empty for this address.
                    if abuse_response.status_code == requests.codes.ok:
                        abuseConfidenceScore = abuse_response.json()["data"][
                            "abuseConfidenceScore"
                        ]

                        print(f"→ {address} has score: {abuseConfidenceScore}")

                        row["abuseConfidenceScore"] = abuseConfidenceScore

                writer.writerow(row)

    # close output file before we exit
    args.output_file.close()
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing IP addresses to resolve.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-k", + "--abuseipdb-api-key", + help="AbuseIPDB.com API key if you want to check whether IPs have been reported.", +) +parser.add_argument( + "-o", + "--output-file", + help="File name to save CSV output.", + required=True, + type=argparse.FileType("w"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +read_addresses_from_file() + +exit() diff --git a/ilri/resolve_orcids.py b/ilri/resolve_orcids.py new file mode 100755 index 000000000000..2b3403903a2c --- /dev/null +++ b/ilri/resolve_orcids.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +# +# resolve-orcids.py 1.2.3 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public ORCID API for names associated with a list of ORCID iDs +# read from a text file or DSpace authority Solr core. Text file should have +# one ORCID identifier per line (comments and invalid lines are skipped). 
+# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import logging +import re +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) + + +# read ORCID identifiers from a text file, one per line +def read_identifiers_from_file(): + # initialize an empty list for ORCID iDs + orcids = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # regular expression for matching exactly one ORCID identifier on a line + pattern = re.compile(r"^[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}$") + + # skip the line if it doesn't match the pattern + if not pattern.match(line): + continue + + # iterate over results and add ORCID iDs that aren't already in the list + if line not in orcids: + orcids.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_orcid_identifiers(orcids) + + +# query DSpace's authority Solr core for ORCID identifiers +def read_identifiers_from_solr(): + # simple query from the 'authority' collection 2000 rows at a time (default is 1000) + solr_query_params = {"q": "orcid_id:*", "wt": "json", "rows": 2000} + + solr_url = args.solr_url + "/authority/select" + + res = requests.get(solr_url, params=solr_query_params) + + numFound = res.json()["response"]["numFound"] + logger.debug( + Fore.GREEN + + f"Total number of Solr records with ORCID iDs: {numFound}" + + Fore.RESET + ) + + # initialize an empty list for ORCID iDs + orcids = [] + + docs = res.json()["response"]["docs"] + # iterate over results and add ORCID iDs that aren't already in the list + # for example, we had 1600 ORCID iDs in Solr, but only 600 are unique + for doc 
in docs: + if doc["orcid_id"] not in orcids: + orcids.append(doc["orcid_id"]) + + # if the user requested --extract-only, write the current ORCID iD to output_file + if args.extract_only: + line = doc["orcid_id"] + "\n" + args.output_file.write(line) + + # exit now if the user requested --extract-only + if args.extract_only: + orcids_extracted = str(len(orcids)) + logger.debug( + Fore.GREEN + + f"Number of unique ORCID identifiers: {orcids_extracted}" + + Fore.RESET + ) + # close output file before we exit + args.output_file.close() + exit() + + resolve_orcid_identifiers(orcids) + + +# Query ORCID's public API for names associated with identifiers. Prefers to use +# the "credit-name" field if it is present, otherwise will default to using the +# "given-names" and "family-name" fields. +def resolve_orcid_identifiers(orcids): + unique_orcids = str(len(orcids)) + logger.debug( + Fore.GREEN + + f"Resolving names associated with {unique_orcids} unique ORCID identifiers.\n" + + Fore.RESET + ) + + # ORCID API endpoint, see: https://pub.orcid.org + orcid_api_base_url = "https://pub.orcid.org/v2.1/" + orcid_api_endpoint = "/person" + + # enable transparent request cache with thirty-day expiry + expire_after = timedelta(days=30) + # cache HTTP 200 and 404 responses, because ORCID uses HTTP 404 when an identifier doesn't exist + requests_cache.install_cache( + "requests-cache", expire_after=expire_after, allowable_codes=(200, 404) + ) + + # prune old cache entries + requests_cache.delete() + + # iterate through our ORCID iDs and fetch their names from the ORCID API + for orcid in orcids: + logger.debug( + Fore.GREEN + + f"Looking up the names associated with ORCID iD: {orcid}" + + Fore.RESET + ) + + # build request URL for current ORCID ID + request_url = orcid_api_base_url + orcid.strip() + orcid_api_endpoint + + # ORCID's API defaults to some custom format, so tell it to give us JSON + request = requests.get(request_url, headers={"Accept": "application/json"}) + + # Check 
the request status + if request.status_code == requests.codes.ok: + # read response JSON into data + data = request.json() + + # make sure name element is not null + if data["name"]: + # prefer to use credit-name if present and not blank + if ( + data["name"]["credit-name"] + and data["name"]["credit-name"]["value"] != "" + ): + line = data["name"]["credit-name"]["value"] + # otherwise try to use given-names and or family-name + else: + # make sure given-names is present and not deactivated + if ( + data["name"]["given-names"] + and data["name"]["given-names"]["value"] + != "Given Names Deactivated" + ): + line = data["name"]["given-names"]["value"] + else: + logger.debug( + Fore.YELLOW + + "Ignoring null or deactivated given-names element." + + Fore.RESET + ) + # make sure family-name is present and not deactivated + if ( + data["name"]["family-name"] + and data["name"]["family-name"]["value"] + != "Family Name Deactivated" + ): + line = f'{line} {data["name"]["family-name"]["value"]}' + else: + logger.debug( + Fore.YELLOW + + "Ignoring null or deactivated family-name element." + + Fore.RESET + ) + # check if line has something (a credit-name, given-names, and or family-name) + if line and line != "": + line = "{0}: {1}".format(line.strip(), orcid) + else: + logger.debug( + Fore.RED + + "Skipping identifier with no valid name elements." + + Fore.RESET + ) + + continue + + if not args.quiet: + logger.info(line) + + # write formatted name and ORCID identifier to output file + args.output_file.write(f"{line}\n") + + # clear line for next iteration + line = None + else: + logger.debug( + Fore.YELLOW + + "Skipping identifier with null name element." + + Fore.RESET + ) + # HTTP 404 means that the API url or identifier was not found. If the + # API URL is correct, let's assume that the identifier was not found. + elif request.status_code == 404: + logger.debug( + Fore.YELLOW + + "Skipping missing identifier (API request returned HTTP 404)." 
+ + Fore.RESET + ) + + continue + # HTTP 409 means that the identifier is locked for some reason + # See: https://members.orcid.org/api/resources/error-codes + elif request.status_code == 409: + logger.debug( + Fore.YELLOW + + "Skipping locked identifier (API request returned HTTP 409)." + + Fore.RESET + ) + + continue + else: + logger.error(Fore.RED + "Request failed." + Fore.RESET) + # close output file before we exit + args.output_file.close() + sys.exit(1) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description='Query the public ORCID API for names associated with a list of ORCID identifiers, either from a text file or a DSpace authority Solr core. Optional "extract only" mode will simply fetch the ORCID identifiers from Solr and write them to the output file without resolving their names from ORCID\'s API.' 
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-e", + "--extract-only", + help="If fetching ORCID identifiers from Solr, write them to the output file without resolving their names from the ORCID API.", + action="store_true", +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +parser.add_argument( + "-q", + "--quiet", + help="Do not print results to screen as we find them (results will still go to output file).", + action="store_true", +) +# group of mutually exclusive options +group = parser.add_mutually_exclusive_group(required=True) +group.add_argument( + "-i", + "--input-file", + help="File name containing ORCID identifiers to resolve.", + type=argparse.FileType("r"), +) +group.add_argument( + "-s", + "--solr-url", + help="URL of Solr application (for example: http://localhost:8080/solr).", +) +args = parser.parse_args() + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# Set the global log format +logging.basicConfig(format="[%(levelname)s] %(message)s") + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the ORCID identifiers from there +if args.input_file: + read_identifiers_from_file() +# otherwise, get the ORCID identifiers from Solr +elif args.solr_url: + read_identifiers_from_solr() diff --git a/ilri/rest_find_collections.py b/ilri/rest_find_collections.py new file mode 100755 index 000000000000..092f4524145f --- /dev/null +++ b/ilri/rest_find_collections.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# +# rest-find-collections.py 1.1.3 +# +# Copyright Alan Orth. 
+# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# A quick and dirty example for parsing the DSpace REST API to find and print +# the names of all collections contained in a community hierarchy. It expects +# exactly one command line argument: the handle of a community. For example: +# +# $ ./rest-find-collections.py 10568/1 +# +# You can optionally specify the URL of a DSpace REST application (default is to +# use http://localhost:8080/rest). +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install requests colorama +# +# See: https://requests.readthedocs.org/en/master + +import argparse +import signal +import sys + +import requests +from colorama import Fore + + +def signal_handler(signal, frame): + sys.exit(1) + + +def parse_community(community_id): + request_url = ( + rest_base_url + + rest_communities_endpoint + + str(community_id) + + "?expand=collections,subCommunities" + ) + try: + request = requests.get(request_url, headers={"user-agent": rest_user_agent}) + except requests.ConnectionError: + sys.stderr.write( + Fore.RED + "Could not connect to {0}.\n".format(args.rest_url) + Fore.RESET + ) + exit(1) + + if request.status_code == requests.codes.ok: + subcommunities = request.json()["subcommunities"] + collections = request.json()["collections"] + + for subcommunity in subcommunities: + subcommunity_id = subcommunity["uuid"] + + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Found subcommunity (id: {subcommunity_id}, handle: {subcommunity_handle}): {subcommunity_name} ==> I must go deeper!\n".format( + subcommunity_id=str(subcommunity_id), + subcommunity_handle=subcommunity["handle"], + subcommunity_name=subcommunity["name"], + ) + + Fore.RESET + ) + + parse_community(subcommunity_id) + + for collection in collections: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Found collection (id: {collection_id}, 
handle: {collection_handle}): {collection_name}\n".format( + collection_id=str(collection["uuid"]), + collection_handle=collection["handle"], + collection_name=collection["name"], + ) + + Fore.RESET + ) + + all_collections.append(collection["name"]) + else: + sys.stderr.write( + Fore.RED + + "Status not ok! Request URL was: {request_url}\n".format( + request_url=request.url + ) + + Fore.RESET + ) + exit(1) + + +parser = argparse.ArgumentParser( + description="Find all collections under a given DSpace community." +) +parser.add_argument("community", help="Community to process, for example: 10568/1") +parser.add_argument("-d", "--debug", help="Print debug messages.", action="store_true") +parser.add_argument( + "-u", + "--rest-url", + help="URL of DSpace REST application.", + default="http://localhost:8080/rest", +) +args = parser.parse_args() + +handle = args.community + +# REST base URL and endpoints (with leading and trailing slashes) +rest_base_url = args.rest_url +rest_handle_endpoint = "/handle/" +rest_communities_endpoint = "/communities/" +rest_collections_endpoint = "/collections/" +rest_user_agent = "Alan Test Python Requests Bot" + +# initialize empty list of all collections +all_collections = [] + +# set the signal handler for SIGINT (^C) +signal.signal(signal.SIGINT, signal_handler) + +# fetch the metadata for the given handle +request_url = rest_base_url + rest_handle_endpoint + str(handle) + +try: + request = requests.get(request_url, headers={"user-agent": rest_user_agent}) +except requests.ConnectionError: + sys.stderr.write( + Fore.RED + + "Could not connect to REST API: {0}.\n".format(args.rest_url) + + Fore.RESET + ) + exit(1) + +# Check the request status +if request.status_code == requests.codes.ok: + handle_type = request.json()["type"] + + # Make sure the given handle is a community + if handle_type == "community": + community_id = request.json()["uuid"] + parse_community(community_id) + + for collection in all_collections: + print( + 
Fore.GREEN + + "Name of collection: {collection}".format(collection=collection) + + Fore.RESET + ) + else: + sys.stderr.write( + Fore.RED + + '{handle} is type "{handle_type}", not community.\n'.format( + handle=handle, handle_type=handle_type + ) + + Fore.RESET + ) + exit(1) +else: + sys.stderr.write( + Fore.RED + + "Request failed. Are you sure {handle} is a valid handle?\n".format( + handle=handle + ) + + Fore.RESET + ) + exit(1) diff --git a/ilri/ror_lookup.py b/ilri/ror_lookup.py new file mode 100755 index 000000000000..776354515600 --- /dev/null +++ b/ilri/ror_lookup.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# +# ror-lookup.py 0.1.1 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the Research Organization Registry dataset for organizations read +# from a text file. Text file should have one organization per line. Results +# are saved to a CSV including the organization and whether it matched or not. +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama +# + +import argparse +import csv +import json +import logging +import signal +import sys + +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) +# Set the global log format +logging.basicConfig(format="[%(levelname)s] %(message)s") + + +# read organizations from a text file, one per line +def read_organizations_from_file(): + # initialize an empty list for organization + organizations = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add organization that aren't already present + if line not in organizations: + organizations.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_organizations(organizations) + + +def 
resolve_organizations(organizations): + fieldnames = ["organization", "match type", "matched"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + for organization in organizations: + logger.debug(f"Looking up the organization: {organization}") + + # check for exact match + if organization.lower() in ror_names: + logger.info( + f"{Fore.GREEN}Name match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "name", + "matched": "true", + } + ) + elif organization.lower() in ror_aliases: + logger.info( + f"{Fore.GREEN}Alias match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "alias", + "matched": "true", + } + ) + elif organization.lower() in ror_acronyms: + logger.info( + f"{Fore.GREEN}Acronym match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "acronym", + "matched": "true", + } + ) + else: + logger.debug( + f"{Fore.YELLOW}No match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "", + "matched": "false", + } + ) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the ROR JSON to validate organizations from a text file and save results in a CSV." +) +parser.add_argument( + "-d", + "--debug", + help="Set log level to DEBUG.", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing organizations to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-r", + "--ror-json", + help="ror.json file containing organizations to look up. 
See: https://doi.org/10.6084/m9.figshare.c.4596503.v5", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write results to (CSV).", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# if the user specified an input file, get the organizations from there +if args.input_file and args.ror_json: + ror = json.load(args.ror_json) + + # list comprehension instead of a for loop to extract all names + ror_names = [org["name"].lower() for org in ror] + + # nested list comprehension to extract aliases, think of it like: + # ror_aliases_all = [] + # for org in ror: + # for alias in org['aliases']: + # ror_aliases_all.append(alias) + # + # See: https://stackoverflow.com/questions/18072759/list-comprehension-on-a-nested-list + ror_aliases_all = [alias.lower() for org in ror for alias in org["aliases"]] + # dedupe the list by converting it to a dict and back to a list (dicts can't + # have any duplicate items) + ror_aliases = list(dict.fromkeys(ror_aliases_all)) + # delete the list of all aliases + del ror_aliases_all + + # same for acronyms + ror_acronyms_all = [acronym.lower() for org in ror for acronym in org["acronyms"]] + ror_acronyms = list(dict.fromkeys(ror_acronyms_all)) + del ror_acronyms_all + + read_organizations_from_file() + +exit() diff --git a/ilri/sherpa_issn_lookup.py b/ilri/sherpa_issn_lookup.py new file mode 100755 index 000000000000..524190711159 --- /dev/null +++ b/ilri/sherpa_issn_lookup.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# +# sherpa-issn-lookup.py 0.0.1 +# +# Copyright Alan Orth. 
+# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public Sherpa API for journal titles using ISSNs read from a +# text file. The text file should have one ISSN per line. +# +# See: https://v2.sherpa.ac.uk/api/object-retrieval-by-id.html +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import csv +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +from colorama import Fore + + +# read journals from a text file, one per line +def read_issns_from_file(): + # initialize an empty list for ISSNs + issns = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add ISSNs that aren't already present + if line not in issns: + issns.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_issns(issns) + + +def resolve_issns(issns): + fieldnames = ["issn", "journal title"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + # enable transparent request cache with two weeks expiry + expire_after = timedelta(days=14) + requests_cache.install_cache("requests-cache", expire_after=expire_after) + + # prune old cache entries + requests_cache.delete() + + for issn in issns: + if args.debug: + sys.stderr.write(Fore.GREEN + f"Looking up ISSN: {issn}\n" + Fore.RESET) + + request_url = "https://v2.sherpa.ac.uk/cgi/retrieve_by_id" + request_params = { + "item-type": "publication", + "format": "Json", + "api-key": args.api_key, + "identifier": issn, + } + + try: + request = requests.get(request_url, params=request_params) + + data = request.json() + except requests.exceptions.ConnectionError: + sys.stderr.write(Fore.RED + "Connection error.\n" + Fore.RESET) + + # 
CrossRef responds 404 if a journal isn't found, so we check for an + # HTTP 2xx response here + if request.status_code == requests.codes.ok and len(data["items"]) == 1: + print(f"Exact match for {issn} in Sherpa (cached: {request.from_cache})") + + writer.writerow( + {"issn": issn, "journal title": data["items"][0]["title"][0]["title"]} + ) + else: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + f"No match for {issn} in Sherpa (cached: {request.from_cache})\n" + + Fore.RESET + ) + + writer.writerow({"issn": issn, "journal title": ""}) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the Crossref REST API to validate ISSNs from a text file." +) +parser.add_argument( + "-a", + "--api-key", + help="Sherpa API KEY.", +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing ISSNs to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file (CSV) to write results to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the ISSNs from there +if args.input_file: + read_issns_from_file() + +exit() diff --git a/ilri/subdivision_lookup.py b/ilri/subdivision_lookup.py new file mode 100755 index 000000000000..0591ac29ed1c --- /dev/null +++ b/ilri/subdivision_lookup.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# subdivision-lookup.py 0.0.1 +# +# Copyright Alan Orth. 
+# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the pycountry ISO 3166-2 dataset for subdivisions read from a text +# file. Text file should have one subdivision per line. Results are saved to +# a CSV including the subdivision and whether it matched or not. +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama pycountry +# + +import argparse +import csv +import signal +import sys + +import pycountry +from colorama import Fore + + +# read subdivisions from a text file, one per line +def read_subdivisions_from_file(): + # initialize an empty list for subdivisions + subdivisions = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add subdivisions that aren't already present + if line not in subdivisions: + subdivisions.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_subdivisions(subdivisions) + + +def resolve_subdivisions(subdivisions): + fieldnames = ["subdivision", "matched"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + for subdivision in subdivisions: + if args.debug: + sys.stderr.write( + Fore.GREEN + f"Looking up the subdivision: {subdivision}\n" + Fore.RESET + ) + + # check for exact match + if subdivision.lower() in subdivision_names: + print(f"Match for {subdivision!r}") + + writer.writerow( + { + "subdivision": subdivision, + "matched": "true", + } + ) + else: + if args.debug: + sys.stderr.write( + Fore.YELLOW + f"No match for {subdivision!r}\n" + Fore.RESET + ) + + writer.writerow( + { + "subdivision": subdivision, + "matched": "false", + } + ) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + 
sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query pycountry's ISO 3166-2 list to validate subdivisions from a text file and save results in a CSV." +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing subdivisions to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write results to (CSV).", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# list comprehension instead of a for loop to extract all subdivision names +subdivision_names = [subdivision.name.lower() for subdivision in pycountry.subdivisions] + +read_subdivisions_from_file() + +exit() diff --git a/ilri/update_orcids.py b/ilri/update_orcids.py new file mode 100755 index 000000000000..70e92389c7cf --- /dev/null +++ b/ilri/update_orcids.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# +# update-orcids.py v0.1.4 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Expects a text file with author names and ORCID identifiers in the following +# format: +# +# Jose Polania: 0000-0002-1186-0503 +# Joseph Fargione: 0000-0002-0636-5380 +# Joseph M. Sandro: 0000-0002-8311-2299 +# +# Will check existing ORCID metadata to make sure they use the author's latest +# name format. 
+# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install colorama +# + +import argparse +import logging +import re +import signal +import sys + +import util +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) + + +def signal_handler(signal, frame): + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Update ORCID records in the DSpace PostgreSQL database." +) +parser.add_argument( + "-i", + "--input-file", + help='Path to input file containing ORCIDs in format "Alan S. Orth: 0000-0002-1735-7458".', + required=True, + type=argparse.FileType("r", encoding="UTF-8"), +) +parser.add_argument("-db", "--database-name", help="Database name", required=True) +parser.add_argument("-u", "--database-user", help="Database username", required=True) +parser.add_argument("-p", "--database-pass", help="Database password", required=True) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-n", + "--dry-run", + help="Only print changes that would be made.", + action="store_true", +) +parser.add_argument( + "-q", + "--quiet", + help="Do not print progress messages to the screen.", + action="store_true", +) +args = parser.parse_args() + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# Set the global log format +logging.basicConfig(format="[%(levelname)s] %(message)s") + +# set the signal handler for SIGINT (^C) +signal.signal(signal.SIGINT, signal_handler) + +# connect to database +conn = util.db_connect( + args.database_name, args.database_user, args.database_pass, "localhost" +) + +cursor = conn.cursor() + +# Use read().splitlines() so we don't get newlines after each line, 
though I'm +# not sure if we should also be stripping? +for line in args.input_file.read().splitlines(): + # extract the ORCID identifier from the current line + orcid_identifier_pattern = re.compile( + r"[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}" + ) + orcid_identifier_match = orcid_identifier_pattern.search(line) + + # sanity check to make sure we extracted the ORCID identifier + if orcid_identifier_match is None: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + f'Skipping invalid ORCID identifier in "{line}".\n' + + Fore.RESET + ) + continue + + # we only expect one ORCID identifier, so if it matches it will be group "0" + # see: https://docs.python.org/3/library/re.html + orcid_identifier = orcid_identifier_match.group(0) + + metadata_field_id = util.field_name_to_field_id(cursor, "cg.creator.identifier") + + # note that the SQL here is quoted differently to allow us to use + # LIKE with % wildcards with our paremeter subsitution + sql = "SELECT text_value, dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value LIKE '%%' || %s || '%%' AND text_value!=%s" + cursor.execute(sql, (metadata_field_id, orcid_identifier, line)) + + # Get the records for items with matching metadata. We will use the + # object IDs to update their last_modified dates. 
+ matching_records = cursor.fetchall() + + if args.dry_run: + if cursor.rowcount > 0 and not args.quiet: + logger.info( + Fore.GREEN + + f"(DRY RUN) Fixed {cursor.rowcount} occurences of: {line}" + + Fore.RESET + ) + else: + sql = "UPDATE metadatavalue SET text_value=%s WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value LIKE '%%' || %s || '%%' AND text_value!=%s" + cursor.execute( + sql, + ( + line, + metadata_field_id, + orcid_identifier, + line, + ), + ) + + if cursor.rowcount > 0 and not args.quiet: + logger.info( + Fore.GREEN + + f"Fixed {cursor.rowcount} occurences of: {line}" + + Fore.RESET + ) + + # Update the last_modified date for each item we've changed + for record in matching_records: + util.update_item_last_modified(cursor, record[1]) + + +# commit changes when we're done +if not args.dry_run: + conn.commit() + +# close database connection before we exit +conn.close() + +# close input file +args.input_file.close() + +sys.exit(0) diff --git a/ilri/util.py b/ilri/util.py new file mode 100644 index 000000000000..edc6a36cdf1a --- /dev/null +++ b/ilri/util.py @@ -0,0 +1,159 @@ +# util.py v0.0.5 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Various helper functions for CGSpace DSpace Python scripts. +# + +import gzip +import os +import re +import shutil +import sys + +import psycopg +import requests +import requests_cache +from colorama import Fore + + +def field_name_to_field_id(cursor, metadata_field: str): + """Return the metadata_field_id for a given metadata field. + + TODO: handle case where schema doesn't exist + TODO: handle case where metadata field doesn't exist + + :param cursor: a psycopg cursor with an active database session. + :param metadata_field: the metadata field, for example "dcterms.title". 
+ :returns int + """ + + if len(metadata_field.split(".")) == 3: + schema, element, qualifier = metadata_field.split(".") + elif len(metadata_field.split(".")) == 2: + schema, element = metadata_field.split(".") + qualifier = None + + # First we need to get the schema ID + sql = "SELECT metadata_schema_id FROM metadataschemaregistry WHERE short_id=%s;" + # Syntax looks weird here, but the second argument must always be a sequence + # See: https://www.psycopg.org/docs/usage.html + cursor.execute(sql, [schema]) + + if cursor.rowcount > 0: + metadata_schema_id = cursor.fetchone()[0] + + # Now we can get the metadata field ID, paying attention to whether the + # field has a qualifier or not. + if qualifier: + sql = "SELECT metadata_field_id FROM metadatafieldregistry WHERE metadata_schema_id=%s AND element=%s AND qualifier=%s;" + cursor.execute(sql, [metadata_schema_id, element, qualifier]) + else: + sql = "SELECT metadata_field_id FROM metadatafieldregistry WHERE metadata_schema_id=%s AND element=%s" + cursor.execute(sql, [metadata_schema_id, element]) + + if cursor.rowcount > 0: + metadata_field_id = cursor.fetchone()[0] + + return metadata_field_id + + +def update_item_last_modified(cursor, dspace_object_id: str): + """Update an item's last_modified timestamp. + + :param cursor: a psycopg cursor with an active database session. + :param dspace_object_id: dspace_object_id of the item to update. + """ + + sql = "UPDATE item SET last_modified=NOW() WHERE uuid=%s;" + # Syntax looks weird here, but the second argument must always be a sequence + # See: https://www.psycopg.org/docs/usage.html + cursor.execute(sql, [dspace_object_id]) + + +def db_connect( + database_name: str, database_user: str, database_pass: str, database_host: str +): + """Connect to a PostgreSQL database. + + :param database_name: a string containing the database name. + :param database_user: a string containing the database user. + :param database_pass: a string containing the database pass. 
+ :param database_host: a string containing the database host. + :returns psycopg connection + """ + + try: + conn = psycopg.connect( + f"dbname={database_name} user={database_user} password={database_pass} host={database_host}" + ) + except psycopg.OperationalError: + sys.stderr.write(Fore.RED + "Could not connect to database.\n" + Fore.RESET) + sys.exit(1) + + return conn + + +def read_dois_from_file(input_file) -> list: + """Read DOIs from a file. + + DOIs should be one per line with either http, https, dx.doi.org, doig.org + or just the DOI itself. Anything other than the DOI will be stripped. + + :param input_file: a file handle (class _io.TextIOWrapper ???). + :returns list of DOIs + """ + + # initialize an empty list for DOIs + dois = [] + + for line in input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # trim http://, https://, etc to make sure we only get the DOI component + line = re.sub(r"^https?://(dx\.)?doi\.org/", "", line) + + # iterate over results and add DOIs that aren't already present + if line not in dois: + dois.append(line) + + # close input file before we exit + input_file.close() + + return dois + + +def download_file(url, filename) -> bool: + # Disable cache for streaming downloads + # See: https://github.com/requests-cache/requests-cache/issues/75 + with requests_cache.disabled(): + r = requests.get(url, stream=True, allow_redirects=True) + + # Download failed for some reason + if not r.ok: + return False + + with open(filename, "wb") as f: + # Make sure we handle zipped content. Note: this is not transport + # compression, which is handled automatically by requests. 
+ try: + content_encoding = r.headers["Content-Encoding"] + except KeyError: + content_encoding = None + + if content_encoding == "gzip": + gzip_file = gzip.GzipFile(fileobj=r.raw) + shutil.copyfileobj(gzip_file, f) + else: + shutil.copyfileobj(r.raw, f) + + # Check whether the file was written to disk after downloading + if os.path.isfile(filename): + return True + else: + return False From 35ab9e829d0674f9e9c90e3745cfe12eea3db2f6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 8 Jan 2024 15:54:42 +0300 Subject: [PATCH 071/119] dspace/config: remove subregion browse It is not helpful at all because the values are uncontrolled. --- dspace/config/dspace.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 8a311054ad9d..c2b9213f719b 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1167,8 +1167,7 @@ webui.browse.index.3 = title:item:title webui.browse.index.4 = subject:metadata:dc.subject.*\,dcterms.subject:text webui.browse.index.5 = region:metadata:cg.coverage.region:text webui.browse.index.6 = country:metadata:cg.coverage.country:text -webui.browse.index.7 = subregion:metadata:cg.coverage.subregion:text -webui.browse.index.8 = itemtype:metadata:dcterms.type:text +webui.browse.index.7 = itemtype:metadata:dcterms.type:text ## example of authority-controlled browse category - see authority control config #webui.browse.index.5 = lcAuthor:metadataAuthority:dc.contributor.author:authority From 980eec30ef897def73123fa06c6207ab3417c310 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 9 Jan 2024 12:37:49 +0300 Subject: [PATCH 072/119] dspace/config: update ORCID identifiers Sorted in vim with `sort i` and formatted with tidy: $ tidy -xml -utf8 -m -iq -w 0 dspace/config/controlled-vocabularies/cg-creator-identifier.xml --- .../cg-creator-identifier.xml | 6693 +++++++++++++---- 1 file changed, 5290 insertions(+), 1403 deletions(-) diff --git 
a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index b081b31883d1..14001271a8ff 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml From ad1a69b2dc79b422046a395e2c39b76b375c6dc1 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 9 Jan 2024 21:29:22 +0300 Subject: [PATCH 073/119] dspace/solr: Increase maxBooleanClauses for all cores This solves an issue when a user belongs to too many groups, which causes the Solr query to have too many "OR" clauses when searching for communities and collections they have access to. We only need the increased value in the search core, but one comment notes that this setting modifies an internal Lucene variable globally, so it can get reset depending on the order cores are loaded in. The error in the DSpace log is: too many boolean clauses. --- dspace/solr/authority/conf/solrconfig.xml | 2 +- dspace/solr/oai/conf/solrconfig.xml | 2 +- dspace/solr/search/conf/solrconfig.xml | 2 +- dspace/solr/statistics/conf/solrconfig.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dspace/solr/authority/conf/solrconfig.xml b/dspace/solr/authority/conf/solrconfig.xml index 21f917ebf8ca..10d503a260b9 100644 --- a/dspace/solr/authority/conf/solrconfig.xml +++ b/dspace/solr/authority/conf/solrconfig.xml @@ -50,7 +50,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/dspace/solr/oai/conf/solrconfig.xml b/dspace/solr/oai/conf/solrconfig.xml index ce8d9ebe2060..30d1cd055861 100644 --- a/dspace/solr/oai/conf/solrconfig.xml +++ b/dspace/solr/oai/conf/solrconfig.xml @@ -59,7 +59,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/dspace/solr/search/conf/solrconfig.xml b/dspace/solr/search/conf/solrconfig.xml index 97b1d1ddbbf6..2a200dc9c172 100644 --- a/dspace/solr/search/conf/solrconfig.xml
+++ b/dspace/solr/search/conf/solrconfig.xml @@ -70,7 +70,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/dspace/solr/statistics/conf/solrconfig.xml b/dspace/solr/statistics/conf/solrconfig.xml index 2b1cff45373d..abfb7d7e17b2 100644 --- a/dspace/solr/statistics/conf/solrconfig.xml +++ b/dspace/solr/statistics/conf/solrconfig.xml @@ -59,7 +59,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} From 35395e8f1f095a6d63c7f77dccb5b1522d985890 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 10 Jan 2024 09:36:40 +0300 Subject: [PATCH 074/119] dspace/config: add controlled vocabulary for publishers This is based on the current top ~100 publishers in CGSpace in addition to all CGIAR centers, Initiatives, and Impact platforms. --- .../dcterms-publisher.xml | 133 ++++++++++++++++++ dspace/config/dspace.cfg | 2 +- dspace/config/submission-forms.xml | 5 +- 3 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 dspace/config/controlled-vocabularies/dcterms-publisher.xml diff --git a/dspace/config/controlled-vocabularies/dcterms-publisher.xml b/dspace/config/controlled-vocabularies/dcterms-publisher.xml new file mode 100644 index 000000000000..d35c3fc29442 --- /dev/null +++ b/dspace/config/controlled-vocabularies/dcterms-publisher.xml @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index c2b9213f719b..e7edc5965540 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -1176,7 +1176,7 @@ webui.browse.index.7 = itemtype:metadata:dcterms.type:text # vocabularies in the submission forms. These could be disabled adding the name of # the vocabularies to exclude in this comma-separated property. 
# (Requires reboot of servlet container, e.g. Tomcat, to reload) -webui.browse.vocabularies.disabled = srsc, dcterms-subject, dc-contributor-author, cg-contributor-donor, cg-contributor-affiliation +webui.browse.vocabularies.disabled = srsc, dcterms-subject, dc-contributor-author, cg-contributor-donor, cg-contributor-affiliation, dcterms-publisher # Enable/Disable tag cloud in browsing. diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 3d5bc0b41aa1..c1477ba354f5 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -245,11 +245,12 @@ dcterms publisher - false + true onebox - Enter the full name of the publisher. + Enter the full name of the publisher. Click to see a pre-populated list of common commercial and CGIAR publishers or enter a new one. + dcterms-publisher From c7f3963f6acaa31bd1e6a0acd9a8245c2e067ddb Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 14 Jan 2024 21:26:11 +0300 Subject: [PATCH 075/119] dspace/config: update affiliations Top 1,000 or so existing affiliations on CGSpace, plus all current Initiatives and Impact Platforms. 
--- .../cg-contributor-affiliation.xml | 2023 +++++++++-------- 1 file changed, 1029 insertions(+), 994 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml index 809ba58df1ae..6f6ae5bb79dc 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml From b4cf167ffe1eb86098ac0c3ea9024afae320fffb Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 16 Jan 2024 15:08:22 +0300 Subject: [PATCH 076/119] dspace/config: update author controlled vocab --- dspace/config/controlled-vocabularies/dc-contributor-author.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index f5ea865e4ca6..a4e4c94a6e33 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xml @@ -998,7 +998,7 @@ - + From 7db5f69b1f4bf4fb4d0aa109a8f710722c4c3489 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 18 Jan 2024 20:30:57 +0300 Subject: [PATCH 077/119] dspace/config: update Initiative name The Fruit and Vegetables Initiative started using their long name on 2023-08. I updated it everywhere but here!
--- dspace/config/submission-forms.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index c1477ba354f5..de09cbdf9b0f 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -2862,8 +2862,8 @@ Fragility to Resilience in Central and West Asia and North Africa - Fruits and Vegetables - Fruits and Vegetables + Fruit and Vegetables for Sustainable Healthy Diets + Fruit and Vegetables for Sustainable Healthy Diets Gender Equality From 01a8fe261d99170982a4dd291906efe2ec723f5f Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 22 Jan 2024 12:45:03 +0300 Subject: [PATCH 078/119] dspace/config: update author Author requested we update his name format on CGSpace. --- dspace/config/controlled-vocabularies/dc-contributor-author.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index a4e4c94a6e33..ce0b65151362 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xml @@ -584,7 +584,7 @@ - + From 74cd120a4af86964ea12b9d3247fdf8697930a44 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 23 Jan 2024 12:47:39 +0300 Subject: [PATCH 079/119] ilri: set connection read_only for dry run Python scripts that interact with the database can use a read-only connection when they are running in dry run mode.
--- ilri/delete_metadata_values.py | 5 ++++- ilri/fix_metadata_values.py | 5 ++++- ilri/update_orcids.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ilri/delete_metadata_values.py b/ilri/delete_metadata_values.py index 7e21afd2a9f9..6be47b093b70 100755 --- a/ilri/delete_metadata_values.py +++ b/ilri/delete_metadata_values.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# delete-metadata-values.py 1.2.4 +# delete-metadata-values.py 1.2.5 # # Copyright Alan Orth. # @@ -95,6 +95,9 @@ def signal_handler(signal, frame): args.database_name, args.database_user, args.database_pass, "localhost" ) +if args.dry_run: + conn.read_only = True + cursor = conn.cursor() for row in reader: diff --git a/ilri/fix_metadata_values.py b/ilri/fix_metadata_values.py index 8ede1cd063dc..0f765747279f 100755 --- a/ilri/fix_metadata_values.py +++ b/ilri/fix_metadata_values.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# fix-metadata-values.py v1.2.6 +# fix-metadata-values.py v1.2.7 # # Copyright Alan Orth # @@ -119,6 +119,9 @@ def signal_handler(signal, frame): args.database_name, args.database_user, args.database_pass, "localhost" ) +if args.dry_run: + conn.read_only = True + cursor = conn.cursor() for row in reader: diff --git a/ilri/update_orcids.py b/ilri/update_orcids.py index 70e92389c7cf..63248abb10dd 100755 --- a/ilri/update_orcids.py +++ b/ilri/update_orcids.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# update-orcids.py v0.1.4 +# update-orcids.py v0.1.5 # # Copyright Alan Orth. 
# @@ -91,6 +91,9 @@ def signal_handler(signal, frame): args.database_name, args.database_user, args.database_pass, "localhost" ) +if args.dry_run: + conn.read_only = True + cursor = conn.cursor() # Use read().splitlines() so we don't get newlines after each line, though I'm From cf63976f3a0bb31de6cd59d54157f5cee30b2487 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 23 Jan 2024 13:59:24 +0300 Subject: [PATCH 080/119] dspace/config: update ORCID identifiers Sorted in vim with `sort i` and formatted with tidy: $ tidy -xml -utf8 -m -iq -w 0 dspace/config/controlled-vocabularies/cg-creator-identifier.xml --- .../cg-creator-identifier.xml | 10960 ++++++++-------- 1 file changed, 5681 insertions(+), 5279 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index 14001271a8ff..54d2f747420d 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml @@ -12,5284 +12,5686 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 4eb31f2208af84604eebedfafabdf88f123cc50f Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 24 Jan 2024 12:53:06 +0300 Subject: [PATCH 081/119] dspace/config: add cg.place to submission-forms.xml This got lost in the transition to DSpace 7. --- dspace/config/submission-forms.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index de09cbdf9b0f..4b45bc3a1ece 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -435,6 +435,7 @@ true + dropdown Select the language of the main content of the item. If the language does not appear in the list, please select 'Other'. If the content does not really have a language (for example, if it @@ -442,6 +443,17 @@ + + cg + place + + false + + + onebox + NOT for journal articles. 
Enter the city and country, for example: Nairobi, Kenya + + From 7e5252a0076126ee9d9ec0964ceb9633887965f2 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 26 Jan 2024 16:52:07 +0300 Subject: [PATCH 082/119] dspace/config: adjust submission form Put Initiatives and Impact Areas on the same line. --- dspace/config/submission-forms.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 4b45bc3a1ece..dcd57f4156f2 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -467,6 +467,7 @@ initiative true + dropdown Select any CGIAR Initiatives(s) associated with this item. Use this to show that an Initiative funded this item. @@ -479,6 +480,7 @@ impactPlatform true + dropdown Select any CGIAR Impact Platforms associated with this item. From b3863e2fe46d686fe561f787ef9d539b894a8bbf Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 26 Jan 2024 18:22:10 +0300 Subject: [PATCH 083/119] dspace/config: add project identifier to submission form --- dspace/config/submission-forms.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index dcd57f4156f2..3a6db81ba6aa 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -521,12 +521,24 @@ + + cg + identifier + project + true + + + onebox + Unique identifier for a project associated with this item. + + cg contributor crp true + dropdown Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. 
From eba4a01d00ff929043c6d902802747937dc62f1a Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 26 Jan 2024 23:41:21 +0300 Subject: [PATCH 084/119] dspace/config: add IFPRI publication rank to submission form --- dspace/config/submission-forms.xml | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 3a6db81ba6aa..b5d916bc3362 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -247,11 +247,23 @@ true + onebox Enter the full name of the publisher. Click to see a pre-populated list of common commercial and CGIAR publishers or enter a new one. dcterms-publisher + + cg + identifier + publicationRank + false + + + dropdown + IFPRI publication rank. + + @@ -3182,6 +3194,28 @@ Systems Transformation + + + A Plus + A Plus + + + A + A + + + B + B + + + C + C + + + Not ranked + Not ranked + + ACP From 1bc3c9c8de03cd9ce94379d0ce7dd0c59f45a813 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 1 Feb 2024 14:56:14 +0300 Subject: [PATCH 085/119] dspace/config: remove incorrect ORCID identifier --- dspace/config/controlled-vocabularies/cg-creator-identifier.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index 54d2f747420d..8823580de2d5 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml @@ -619,7 +619,6 @@ - From 5a6c1cd7d959c7ef4203a5bcc251c4e2c2703994 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 1 Feb 2024 20:52:36 +0300 Subject: [PATCH 086/119] dspace/config: update sponsor hint in submission form --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 
b5d916bc3362..12875fbe2a72 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -136,7 +136,7 @@ onebox cg-contributor-donor - Enter full institution name(s) who sponsored the item. Click below to see a pre-populated list, or add one manually. CGIAR Research Program funding is identified in the field above. + Enter full institution name(s) who sponsored the item. Click above to select from a pre-populated list, or add manually. Outputs of CGIAR platforms and initiatives are always sponsored by 'CGIAR Trust Fund'. From 56d150738f9db9bf08d339a27eccc79df8562387 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 1 Feb 2024 20:58:50 +0300 Subject: [PATCH 087/119] dspace/config: move common languages in submission form Move English, French, and Spanish to the top of the controlled list in the submission form. --- dspace/config/submission-forms.xml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 12875fbe2a72..4ded3313d00b 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -2105,6 +2105,18 @@ "en_US", "en", "es", "de", "fr", "it", "ja", "zh", "other", "" --> + + English + en + + + French + fr + + + Spanish + es + Afar aa @@ -2249,10 +2261,6 @@ Modern Greek (1453-) el - - English - en - Esperanto eo @@ -2285,10 +2293,6 @@ Finnish fi - - French - fr - Western Frisian fy @@ -2693,10 +2697,6 @@ Southern Sotho st - - Spanish - es - Albanian sq From f160bd2e416784c50fab3cf9b94528cc04885ac3 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 1 Feb 2024 21:00:47 +0300 Subject: [PATCH 088/119] dspace/config: remove some old donors from vocabulary --- dspace/config/controlled-vocabularies/cg-contributor-donor.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml 
b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml index bde44dc2b2bf..5c76cc0983fa 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml @@ -114,8 +114,6 @@ - - From 8e9d82c008d2b0725da5d5b0ad2a5fba6d5d6f5c Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 1 Feb 2024 21:04:10 +0300 Subject: [PATCH 089/119] dspace/config: minor update on submission form hint --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 4ded3313d00b..630aee237b7d 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -112,7 +112,7 @@ cg-creator-identifier onebox - Enter ORCID identifiers (for CGIAR authors at least). Click below to see a pre-populated list of author ORCID identifiers that you can select from. Enter one per author. If an identifier is missing, enter a new one in the same format (Name: 0000-0002-1735-7458). Use the exact name style the author uses at https://orcid.org + Enter ORCID identifiers (for CGIAR authors at least). Click above to select from a pre-populated list of author ORCID identifiers. Enter one per author. If an identifier is missing, enter a new one in the same format (Name: 0000-0002-1735-7458). 
Use the exact name style the author uses at https://orcid.org From 8aa0b3d6f8c2218a19970d867a0821a533242d36 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 2 Feb 2024 08:28:46 +0300 Subject: [PATCH 090/119] dspace/config: remove duplicate ORCID identifier --- dspace/config/controlled-vocabularies/cg-creator-identifier.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index 8823580de2d5..a86274fc3b0e 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml @@ -3034,7 +3034,6 @@ - From f91e366112d6a0956d25769fdb72257ebe591b14 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 6 Feb 2024 11:27:36 +0300 Subject: [PATCH 091/119] dspace/config: update controlled vocabulary for ORCID identifiers Remove some incorrect ones based on clustering in OpenRefine and checking their profiles on orcid.org. --- .../cg-creator-identifier.xml | 10985 ++++++++-------- 1 file changed, 5483 insertions(+), 5502 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index a86274fc3b0e..d49d917e362b 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml From c24949c65f2237cf8b41f0f5cdea6177d282c713 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 8 Feb 2024 11:19:57 +0300 Subject: [PATCH 092/119] dspace/config: add controlled vocabulary for project identifier For now only IFPRI is using these. 
--- .../cg-identifier-project.xml | 127 ++++++++++++++++++ dspace/config/submission-forms.xml | 1 + 2 files changed, 128 insertions(+) create mode 100644 dspace/config/controlled-vocabularies/cg-identifier-project.xml diff --git a/dspace/config/controlled-vocabularies/cg-identifier-project.xml b/dspace/config/controlled-vocabularies/cg-identifier-project.xml new file mode 100644 index 000000000000..54aa3ce970f3 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-identifier-project.xml @@ -0,0 +1,127 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 630aee237b7d..41dab3cf8184 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -541,6 +541,7 @@ onebox + cg-identifier-project Unique identifier for a project associated with this item. From f12f9d620cc4bd009586d03fd6a0f650858fbc91 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 9 Feb 2024 21:00:40 +0300 Subject: [PATCH 093/119] dspace/config: only show ISSN for journal articles Use type-bind to only show ISSN for journal articles and data papers. 
--- dspace/config/submission-forms.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 41dab3cf8184..84028d65d066 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -403,6 +403,7 @@ onebox Enter the ISSN for the serial publication where this item appears, for example: 2049-3630 + Journal Article,Data Paper From 898b13cf9836666070dd3ffd8ddf4b5f928531d8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 9 Feb 2024 21:01:15 +0300 Subject: [PATCH 094/119] dspace/config: minor tweak to related reference Minor tweak to language. --- dspace/config/submission-forms.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 84028d65d066..3dac30b90054 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -741,7 +741,7 @@ relation true - + onebox Enter related reference link (normally a URL to another item). 
From 8f8ca8c3b456e3ca6c049b78b13069accc99df17 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 22 Feb 2024 20:33:20 +0300 Subject: [PATCH 095/119] submission-forms.xml: re-work some URL fields We decided to re-work the way we tag datasets: - If a dataset has a DOI, it should be entered in the DOI field - Otherwise, it should be entered in the normal URL field - The "cg.identifier.dataurl" field will be for "related" datasets --- dspace/config/submission-forms.xml | 31 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 3dac30b90054..2e432f904bc1 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -413,9 +413,9 @@ url true - + onebox - Enter a full URL for the item, for example a website, blog post, Google Books URL, etc (do not use this for links to datasets, use data URL instead). + Enter an official external URL for this item. For example: a link to the website, blog post, dataset, etc itself. Do not enter a DOI here. @@ -424,20 +424,9 @@ doi false - - onebox - Enter the full address in format: https://doi.org/10.1038/s41598-019-43406-0 - - - - cg - identifier - dataurl - true - - + onebox - A URL for any associated data file(s), in a repository for example. + If this item has a DOI, enter the full address here. For example: https://doi.org/10.1038/s41598-019-43406-0 @@ -742,10 +731,22 @@ true + onebox Enter related reference link (normally a URL to another item). + + cg + identifier + dataurl + true + + + onebox + Enter URL for any associated data file(s), in a repository for example. 
+ + From 4625cbac72cef70a31b6e6f3826384d6932e5474 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 26 Feb 2024 11:09:00 +0300 Subject: [PATCH 096/119] dspace: fix author name in controlled vocabulary --- dspace/config/controlled-vocabularies/dc-contributor-author.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index ce0b65151362..dc910cf8a4df 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xml @@ -206,7 +206,7 @@ - + From d7c24c8cf4971c805a1f9ebd1fce38da37b24cc3 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 27 Feb 2024 15:31:52 +0300 Subject: [PATCH 097/119] dspace/config: fix IFPRI project identifier --- dspace/config/controlled-vocabularies/cg-identifier-project.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/cg-identifier-project.xml b/dspace/config/controlled-vocabularies/cg-identifier-project.xml index 54aa3ce970f3..194af4e818ef 100644 --- a/dspace/config/controlled-vocabularies/cg-identifier-project.xml +++ b/dspace/config/controlled-vocabularies/cg-identifier-project.xml @@ -55,7 +55,7 @@ - + From 7c396f39784a749595edc08f75ab16e2450df06c Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 27 Feb 2024 15:50:03 +0300 Subject: [PATCH 098/119] dspace/config: update author controlled vocabulary IFPRI sent me a list of their authors so I combined it with ours. 
--- .../dc-contributor-author.xml | 3261 +++++++++-------- 1 file changed, 1763 insertions(+), 1498 deletions(-) diff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index dc910cf8a4df..2d106f03fe1a 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xmlrom bd52848e18518b68b9fb5f836fa0f7922ccd374a Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 29 Feb 2024 09:19:33 +0300 Subject: [PATCH 099/119] dspace/config: update project identifiers --- .../cg-identifier-project.xml | 193 +++++++++--------- 1 file changed, 97 insertions(+), 96 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-identifier-project.xml b/dspace/config/controlled-vocabularies/cg-identifier-project.xml index 194af4e818ef..1fefb08eced1 100644 --- a/dspace/config/controlled-vocabularies/cg-identifier-project.xml +++ b/dspace/config/controlled-vocabularies/cg-identifier-project.xml @@ -2,60 +2,60 @@ - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + @@ -75,53 +75,54 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + From dd72d9bcfcd4a3314d36de327d66f923f9a17977 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 1 Mar 2024 18:41:36 +0300 Subject: [PATCH 100/119] dspace: disable controlled vocabulary for journal There is a bug with onebox fields that use controlled vocabularies and are not repeatable. 
--- dspace/config/submission-forms.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/dspace/config/submission-forms.xml b/dspace/config/submission-forms.xml index 2e432f904bc1..a43b52e7365e 100644 --- a/dspace/config/submission-forms.xml +++ b/dspace/config/submission-forms.xml @@ -159,7 +159,6 @@ false - cg-journal onebox Enter the full journal title. Journal Article,Data Paper From fe60e6a164c14b71d99df4ed79d055a2690fd420 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 3 Mar 2024 21:02:12 +0300 Subject: [PATCH 101/119] dspace: update controlled vocabulary for affiliation Remove a few incorrect and duplicate affiliations. --- .../controlled-vocabularies/cg-contributor-affiliation.xml | 3 --- 1 file changed, 3 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml index 6f6ae5bb79dc..1be4a9c823e7 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml @@ -403,13 +403,11 @@ - - @@ -428,7 +426,6 @@ - From 796e67135f86c4e0b253243ce1fed2f974fd3d26 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 5 Mar 2024 18:01:55 +0300 Subject: [PATCH 102/119] dspace: remove duplicate authors I don't know why these are here, I fixed them twice already grrr. 
--- dspace/config/controlled-vocabularies/dc-contributor-author.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index 2d106f03fe1a..257f997a0a88 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xml @@ -254,7 +254,6 @@ - @@ -694,7 +693,6 @@ - From 467adbebc7cad7aa45ecb97edc1c1529137c5158 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Mar 2024 11:12:44 +0300 Subject: [PATCH 103/119] dspace/config: update UNICEF in controlled vocabularies The name has been United Nations Children's Fund since 1953. :) See: https://www.unicef.org/about-unicef/frequently-asked-questions#3 --- .../controlled-vocabularies/cg-contributor-affiliation.xml | 2 +- dspace/config/controlled-vocabularies/cg-contributor-donor.xml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml index 1be4a9c823e7..522530f8a9d8 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml @@ -743,11 +743,11 @@ + - diff --git a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml index 5c76cc0983fa..7bcedd05112e 100644 --- a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml +++ b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml @@ -721,7 +721,6 @@ - From c900e5c226aa6caab8f04ef0fa79684232566ed8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Mar 2024 16:23:26 +0300 Subject: [PATCH 104/119] ilri/generate_thumbnails.py: use webp for thumbnails --- ilri/generate_thumbnails.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/ilri/generate_thumbnails.py b/ilri/generate_thumbnails.py index 8d02f92d969e..408ff9a7b877 100755 --- a/ilri/generate_thumbnails.py +++ b/ilri/generate_thumbnails.py @@ -34,10 +34,10 @@ def signal_handler(signal, frame): sys.exit(1) -# Process thumbnails from filename.pdf to filename.jpg using libvips. Equivalent +# Process thumbnails from filename.pdf to filename.webp using libvips. Equivalent # to the following shell invocation: # -# vipsthumbnail 64661.pdf -s 600 -o '%s.jpg[Q=85,optimize_coding,strip]' +# vipsthumbnail 64661.pdf -s 600 -o '%s.webp[Q=89,strip]' # # vips is faster than GraphicsMagick/ImageMagick, uses less memory, and seems # to generate better quality images. Note that libvips uses poppler instead of @@ -47,7 +47,7 @@ def signal_handler(signal, frame): # See: https://github.com/libvips/libvips/issues/379 def create_thumbnail(row): filename = row[args.filename_field_name] - thumbnail = os.path.splitext(filename)[0] + ".jpg" + thumbnail = os.path.splitext(filename)[0] + ".webp" # check if the file has been downloaded if not os.path.isfile(filename): if args.debug: @@ -65,7 +65,7 @@ def create_thumbnail(row): vips_image = pyvips.Image.new_from_file(filename, access="sequential") # Set max height to 600px vips_thumbnail = vips_image.thumbnail_image(600) - vips_thumbnail.jpegsave(thumbnail, Q=85, optimize_coding=True, strip=True) + vips_thumbnail.webpsave(thumbnail, Q=89, strip=True) return From 5b57db5c1641a4d4d98852d7816029c93c501625 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Mar 2024 16:25:01 +0300 Subject: [PATCH 105/119] ilri/generate_thumbnails.py: minor optimization Use a list comprehension to filter out rows that are missing a url or filename. 
--- ilri/generate_thumbnails.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ilri/generate_thumbnails.py b/ilri/generate_thumbnails.py index 408ff9a7b877..03c3786f7332 100755 --- a/ilri/generate_thumbnails.py +++ b/ilri/generate_thumbnails.py @@ -171,7 +171,9 @@ def download_bitstream(row): ) sys.exit(1) - for row in reader: + rows_to_process = [row for row in reader if row[args.url_field_name] and row[args.filename_field_name]] + + for row in rows_to_process: download_bitstream(row) if args.download_only is not True: From cc58f3c25ec4dc935c4a1d80befe6abc534c5df1 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Mar 2024 16:58:13 +0300 Subject: [PATCH 106/119] ilri/generate_thumbnails.py: use f-strings --- ilri/generate_thumbnails.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/ilri/generate_thumbnails.py b/ilri/generate_thumbnails.py index 03c3786f7332..aa77f7a553b1 100755 --- a/ilri/generate_thumbnails.py +++ b/ilri/generate_thumbnails.py @@ -51,17 +51,15 @@ def create_thumbnail(row): # check if the file has been downloaded if not os.path.isfile(filename): if args.debug: - print(Fore.YELLOW + "> Missing {}.\n".format(filename) + Fore.RESET) + print(f"{Fore.YELLOW}> Missing {filename}.{Fore.RESET}") # check if we already have a thumbnail elif os.path.isfile(thumbnail): if args.debug: print( - Fore.YELLOW - + f"> Thumbnail for {filename} already exists.\n" - + Fore.RESET + f"{Fore.YELLOW}> Thumbnail for {filename} already exists.{Fore.RESET}" ) else: - print(Fore.GREEN + f"> Creating thumbnail for {filename}..." 
+ Fore.RESET) + print(f"{Fore.GREEN}> Creating thumbnail for {filename}...{Fore.RESET}") vips_image = pyvips.Image.new_from_file(filename, access="sequential") # Set max height to 600px vips_thumbnail = vips_image.thumbnail_image(600) @@ -154,20 +152,12 @@ def download_bitstream(row): # check if the filename and URL fields specified by the user exist in the CSV if args.filename_field_name not in reader.fieldnames: sys.stderr.write( - Fore.RED - + 'Specified field "{}" does not exist in the CSV.\n'.format( - args.filename_field_name - ) - + Fore.RESET + f"{Fore.RED}Specified field '{args.filename_field_name}' does not exist in the CSV.\n{Fore.RESET}" ) sys.exit(1) if args.url_field_name not in reader.fieldnames: sys.stderr.write( - Fore.RED - + 'Specified field "{0}" does not exist in the CSV.\n'.format( - args.url_field_name - ) - + Fore.RESET + f"{Fore.RED}Specified field '{args.url_field_name}' does not exist in the CSV.\n{Fore.RESET}" ) sys.exit(1) From cf9077939582f9bc4388d9fdd35bb8e71101c83b Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Mar 2024 16:58:22 +0300 Subject: [PATCH 107/119] ilri/generate_thumbnails.py: run black --- ilri/generate_thumbnails.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ilri/generate_thumbnails.py b/ilri/generate_thumbnails.py index aa77f7a553b1..30791d134c46 100755 --- a/ilri/generate_thumbnails.py +++ b/ilri/generate_thumbnails.py @@ -161,7 +161,11 @@ def download_bitstream(row): ) sys.exit(1) - rows_to_process = [row for row in reader if row[args.url_field_name] and row[args.filename_field_name]] + rows_to_process = [ + row + for row in reader + if row[args.url_field_name] and row[args.filename_field_name] + ] for row in rows_to_process: download_bitstream(row) From 40f4d3334ceae214aca08b4a46efcefafb6344ca Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 8 Mar 2024 16:59:02 +0300 Subject: [PATCH 108/119] ilri/generate_thumbnails.py: bump version --- ilri/generate_thumbnails.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/ilri/generate_thumbnails.py b/ilri/generate_thumbnails.py index 30791d134c46..922d3f06967c 100755 --- a/ilri/generate_thumbnails.py +++ b/ilri/generate_thumbnails.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# generate-thumbnails.py 1.1.3 +# generate-thumbnails.py 1.1.4 # # Copyright Alan Orth. # From c10a1cc05653006f42103358cde8c331a51c79b6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 19 Mar 2024 15:54:16 +0300 Subject: [PATCH 109/119] dspace/config: update ORCID identifiers IWMI sent me some new ones so I updated our list. This time I used GNU `sort` instead of `sort i` in vim because GNU sort handles ac- cents better. Formatted with tidy: $ tidy -xml -utf8 -m -iq -w 0 dspace/config/controlled-vocabularies/cg-creator-identifier.xml --- .../cg-creator-identifier.xml | 11301 ++++++++-------- 1 file changed, 5664 insertions(+), 5637 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index d49d917e362b..3b9dd75637f3 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml @@ -1,267 +1,267 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + - - - - - - - - - - + + + + + + + + + + - - - - - - - - - - + + + + + + + + + + - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -273,8 +273,8 @@ - - + + @@ -284,5393 +284,5420 @@ - - - + + + - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From c3202fe453140592eb975dbd7058b2285fd6a2fa Mon Sep 17 00:00:00 2001 From: Alan Orth 
Date: Fri, 22 Mar 2024 13:26:10 +0300 Subject: [PATCH 110/119] dspace/config: update authors --- dspace/config/controlled-vocabularies/dc-contributor-author.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml index 257f997a0a88..0bcd7be8a6ad 100644 --- a/dspace/config/controlled-vocabularies/dc-contributor-author.xml +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xml @@ -880,7 +880,6 @@ - @@ -1656,6 +1655,7 @@ + From c3c00ef8835f7e13bc7e625d11a6ddec2f459b8d Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 4 Apr 2024 12:22:31 +0300 Subject: [PATCH 111/119] dspace/config: update IFPRI project identifiers --- .../cg-identifier-project.xml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-identifier-project.xml b/dspace/config/controlled-vocabularies/cg-identifier-project.xml index 1fefb08eced1..71ff8802fd97 100644 --- a/dspace/config/controlled-vocabularies/cg-identifier-project.xml +++ b/dspace/config/controlled-vocabularies/cg-identifier-project.xml @@ -118,11 +118,12 @@ - - - - - - + + + + + + + From baaca131c8558f0f92c604bfc37b6c74b5613243 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sat, 6 Apr 2024 10:23:37 +0300 Subject: [PATCH 112/119] dspace/config/emails: update subscription message As of DSpace 7.6.1 the community and collection subscription emails say "changed" items, which is confusing for users. Overriding this for now. 
--- dspace/config/emails/subscriptions_content | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/emails/subscriptions_content b/dspace/config/emails/subscriptions_content index 9b8c91e559df..a1550f012be7 100644 --- a/dspace/config/emails/subscriptions_content +++ b/dspace/config/emails/subscriptions_content @@ -8,11 +8,11 @@ This email is sent from ${config.get('dspace.name')} based on the chosen subscri #if( not( "$params[0]" == "" )) Community Subscriptions: ------------------------ -List of changed items : ${params[0]} +List of new/changed items : ${params[0]} #end #if( not( "$params[1]" == "" )) Collection Subscriptions: ------------------------- -List of changed items : ${params[1]} +List of new/changed items : ${params[1]} #end From 038ac112bb09d03b2ebacf362d9899bb3b659987 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 19 Apr 2024 13:09:16 +0300 Subject: [PATCH 113/119] dspace/config: update ORCID identifiers Sorted first with GNU `sort` instead of `sort i` in vim because GNU sort handles accents better. 
Formatted with tidy: $ tidy -xml -utf8 -m -iq -w 0 dspace/config/controlled-vocabularies/cg-creator-identifier.xml --- .../cg-creator-identifier.xml | 1654 ++++++++--------- 1 file changed, 827 insertions(+), 827 deletions(-) diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml index 3b9dd75637f3..549a7d02c5ab 100644 --- a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml @@ -1411,7 +1411,7 @@ - + @@ -1787,7 +1787,7 @@ - + @@ -1968,7 +1968,7 @@ - + @@ -2338,10 +2338,10 @@ - - - - + + + + @@ -2700,7 +2700,7 @@ - + @@ -2939,7 +2939,7 @@ - + @@ -3030,7 +3030,7 @@ - + @@ -3067,7 +3067,7 @@ - + @@ -3521,822 +3521,822 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From f96b889d9fb3f07c1c8ddcdeff65916b98fcb612 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 23 Apr 2024 10:35:25 +0300 Subject: [PATCH 114/119] dspace/config: comment out default curation tasks We don't need the NOOP and Register DOI curation tasks. --- dspace/config/modules/curate.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dspace/config/modules/curate.cfg b/dspace/config/modules/curate.cfg index df6f1d17c572..21ccdb653f39 100644 --- a/dspace/config/modules/curate.cfg +++ b/dspace/config/modules/curate.cfg @@ -8,13 +8,13 @@ # NOTE: Other configurations can append to this list of default tasks by simply # adding their own additional values of "plugin.named.org.dspace.curate.CurationTask" -plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.NoOpCurationTask = noop +#plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.NoOpCurationTask = noop plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.ProfileFormats = profileformats plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RequiredMetadata = requiredmetadata #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.ClamScan = vscan #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.MicrosoftTranslator = translate plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.MetadataValueLinkChecker = checklinks -plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RegisterDOI = registerdoi +#plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RegisterDOI = registerdoi #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.CitationPage = citationpage # add new tasks here (or in additional config files) plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger From 
e2a96b37f91eeb2d3e143f07ba7ff2322a3506d4 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 23 Apr 2024 14:39:50 +0300 Subject: [PATCH 115/119] dspace/modules/additions: use cgspace-java-helpers v7.6.1.1 --- dspace/modules/additions/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/modules/additions/pom.xml b/dspace/modules/additions/pom.xml index 7dc205018c41..a3d55da3c64e 100644 --- a/dspace/modules/additions/pom.xml +++ b/dspace/modules/additions/pom.xml @@ -289,7 +289,7 @@ io.github.ilri.cgspace cgspace-java-helpers - 7.6.1-SNAPSHOT + 7.6.1.1-SNAPSHOT From 34a248cbcc03db222afb16d333e1bd25f33ea6af Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 23 Apr 2024 14:40:41 +0300 Subject: [PATCH 116/119] dspace/config/modules: enable normalizedois curation task --- dspace/config/modules/curate.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dspace/config/modules/curate.cfg b/dspace/config/modules/curate.cfg index 21ccdb653f39..2db0494e71c2 100644 --- a/dspace/config/modules/curate.cfg +++ b/dspace/config/modules/curate.cfg @@ -20,6 +20,8 @@ plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.MetadataV plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger.force +plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.NormalizeDOIs = normalizedois + ## task queue implementation plugin.single.org.dspace.curate.TaskQueue = org.dspace.curate.FileTaskQueue From 096f4393e3d26278fad6ad64c9eea7c4f583ce92 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 23 Apr 2024 14:41:03 +0300 Subject: [PATCH 117/119] dspace/config: Enable workflow curation tasks Enable the normalizedois and countrycodetagger curation tasks to run on workflow items. 
The normalizedois task runs on `editstep`, which is immediately after an item is deposited into the workflow, and for every edit step after (so up to three times depending on how the collection is configured). The countrycodetagger task runs on `finaleditstep`, which is the last edit step (aka "Final editors" or step 3) because we don't want editors to potentially remove the country code tags. The docs allude to an "archive" step that runs just before an item is installed to the archive, but it doesn't seem to work. --- dspace/config/workflow-curation.xml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/dspace/config/workflow-curation.xml b/dspace/config/workflow-curation.xml index 89cfd5309f2a..24cdb98d32d4 100644 --- a/dspace/config/workflow-curation.xml +++ b/dspace/config/workflow-curation.xml @@ -9,7 +9,7 @@ - + @@ -62,7 +62,20 @@ - + + + + + + + + + + + + + + + From e3ad5f5dcf9439861c446238141eae6385d4227b Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 25 Apr 2024 13:00:36 +0300 Subject: [PATCH 118/119] dspace/module/additions: use v7.6.1.2 of cgspace-java-helpers --- dspace/modules/additions/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace/modules/additions/pom.xml b/dspace/modules/additions/pom.xml index a3d55da3c64e..71a918cf1a70 100644 --- a/dspace/modules/additions/pom.xml +++ b/dspace/modules/additions/pom.xml @@ -289,7 +289,7 @@ io.github.ilri.cgspace cgspace-java-helpers - 7.6.1.1-SNAPSHOT + 7.6.1.2-SNAPSHOT From 257e2f9f620df8225944233d3e47b20c2a6cb0cd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 1 May 2024 16:42:58 +0000 Subject: [PATCH 119/119] build(deps): bump org.xmlunit:xmlunit-core in /dspace-api Bumps [org.xmlunit:xmlunit-core](https://github.com/xmlunit/xmlunit) from 2.9.1 to 2.10.0. 
- [Release notes](https://github.com/xmlunit/xmlunit/releases) - [Changelog](https://github.com/xmlunit/xmlunit/blob/main/RELEASE_NOTES.md) - [Commits](https://github.com/xmlunit/xmlunit/compare/v2.9.1...v2.10.0) --- updated-dependencies: - dependency-name: org.xmlunit:xmlunit-core dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- dspace-api/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace-api/pom.xml b/dspace-api/pom.xml index 5b578fa49d5a..4c881cbd2465 100644 --- a/dspace-api/pom.xml +++ b/dspace-api/pom.xml @@ -900,7 +900,7 @@ org.xmlunit xmlunit-core - 2.9.1 + 2.10.0 test