diff --git a/dspace-api/pom.xml b/dspace-api/pom.xml index 5b578fa49d5a..4c881cbd2465 100644 --- a/dspace-api/pom.xml +++ b/dspace-api/pom.xml @@ -900,7 +900,7 @@ org.xmlunit xmlunit-core - 2.9.1 + 2.10.0 test diff --git a/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java b/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java index 615aacac21cc..7c7dbc458c4d 100644 --- a/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java +++ b/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java @@ -443,7 +443,7 @@ public Response addItemMetadata(@PathParam("item_id") String itemId, @POST @Path("/{item_id}/bitstreams") public Bitstream addItemBitstream(@PathParam("item_id") String itemId, InputStream inputStream, - @QueryParam("name") String name, @QueryParam("description") String description, + @QueryParam("name") String name, @QueryParam("description") String description, @QueryParam("bundleName") String bundleName, @QueryParam("groupId") String groupId, @QueryParam("year") Integer year, @QueryParam("month") Integer month, @QueryParam("day") Integer day, @QueryParam("userIP") String user_ip, @@ -467,15 +467,29 @@ public Bitstream addItemBitstream(@PathParam("item_id") String itemId, InputStre log.trace("Creating bitstream in item."); org.dspace.content.Bundle bundle = null; org.dspace.content.Bitstream dspaceBitstream = null; - List bundles = itemService.getBundles(dspaceItem, org.dspace.core.Constants.CONTENT_BUNDLE_NAME); + List bundles = dspaceItem.getBundles(); + + // add bitstream to specified bundle + if (bundleName == null) { + bundleName = "ORIGINAL"; + } + for (Bundle existingBundle : bundles) + { + if (existingBundle.getName().equals(bundleName)) + { + bundle = existingBundle; + break; + } + } if (bundles != null && bundles.size() != 0) { bundle = bundles.get(0); // There should be only one bundle ORIGINAL. 
} if (bundle == null) { - log.trace("Creating bundle in item."); - dspaceBitstream = itemService.createSingleBitstream(context, inputStream, dspaceItem); + log.trace("Creating bundle "+bundleName+" in item."); + dspaceBitstream = itemService.createSingleBitstream(context, inputStream, dspaceItem, bundleName); } else { + log.trace("Getting bundle from item."); dspaceBitstream = bitstreamService.create(context, bundle, inputStream); } diff --git a/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xml new file mode 100644 index 000000000000..522530f8a9d8 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-contributor-affiliation.xmldiff --git a/dspace/config/controlled-vocabularies/cg-contributor-donor.xml b/dspace/config/controlled-vocabularies/cg-contributor-donor.xml new file mode 100644 index 000000000000..7bcedd05112e --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-contributor-donor.xmldiff --git a/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml b/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml new file mode 100644 index 000000000000..164d8bd35982 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-coverage-subregion.xml @@ -0,0 +1,87 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/controlled-vocabularies/cg-creator-identifier.xml b/dspace/config/controlled-vocabularies/cg-creator-identifier.xml new file mode 100644 index 000000000000..549a7d02c5ab --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-creator-identifier.xmldiff --git a/dspace/config/controlled-vocabularies/cg-identifier-project.xml b/dspace/config/controlled-vocabularies/cg-identifier-project.xml new file mode 100644 index 000000000000..71ff8802fd97 --- /dev/null +++ 
b/dspace/config/controlled-vocabularies/cg-identifier-project.xml @@ -0,0 +1,129 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/controlled-vocabularies/cg-journal.xml b/dspace/config/controlled-vocabularies/cg-journal.xml new file mode 100644 index 000000000000..03403765acc1 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-journal.xmldiff --git a/dspace/config/controlled-vocabularies/cg-species-breed.xml b/dspace/config/controlled-vocabularies/cg-species-breed.xml new file mode 100644 index 000000000000..d1844dd7c727 --- /dev/null +++ b/dspace/config/controlled-vocabularies/cg-species-breed.xmldiff --git a/dspace/config/controlled-vocabularies/dc-contributor-author.xml b/dspace/config/controlled-vocabularies/dc-contributor-author.xml new file mode 100644 index 000000000000..0bcd7be8a6ad --- /dev/null +++ b/dspace/config/controlled-vocabularies/dc-contributor-author.xmldiff --git a/dspace/config/controlled-vocabularies/dcterms-publisher.xml b/dspace/config/controlled-vocabularies/dcterms-publisher.xml new file mode 100644 index 000000000000..d35c3fc29442 --- /dev/null +++ b/dspace/config/controlled-vocabularies/dcterms-publisher.xml @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dspace/config/controlled-vocabularies/dcterms-subject.xml b/dspace/config/controlled-vocabularies/dcterms-subject.xml new file mode 100644 index 000000000000..03fb2120a81e --- /dev/null +++ b/dspace/config/controlled-vocabularies/dcterms-subject.xmldiff --git 
a/dspace/config/crosswalks/google-metadata.properties b/dspace/config/crosswalks/google-metadata.properties index 157ee9c0b13c..60843c2eae77 100644 --- a/dspace/config/crosswalks/google-metadata.properties +++ b/dspace/config/crosswalks/google-metadata.properties @@ -11,9 +11,9 @@ # e.g. a dissertation item that contains values for the # dissertation-specific metadata elements. -google.identifiers.dissertation = dc.type:Thesis -google.identifiers.patent = dc.type:Patent -google.identifiers.technical_report = dc.type:Technical Report +google.identifiers.dissertation = dcterms.type:Thesis +google.identifiers.patent = dcterms.type:Patent +google.identifiers.technical_report = dcterms.type:Technical Report # Field Mappings @@ -37,31 +37,31 @@ google.identifiers.technical_report = dc.type:Technical Report # "$simple-pdf" inserts the full URL to the bitstream when there is only one and it is a PDF google.citation_title = dc.title -google.citation_publisher = dc.publisher +google.citation_publisher = dcterms.publisher google.citation_author = dc.author | dc.contributor.author | dc.creator -google.citation_date = dc.date.issued -google.citation_language = dc.language.iso +google.citation_date = dc.date.copyright | dcterms.issued | dc.date.available | dc.date.accessioned +google.citation_language = dcterms.language google.citation_pmid = google.citation_abstract_html_url = $handle google.citation_fulltext_html_url = google.citation_pdf_url = $simple-pdf -google.citation_keywords = dc.subject, dc.type +google.citation_keywords = dcterms.subject, dcterms.type -google.citation_journal_title = -google.citation_volume = -google.citation_issue = +google.citation_journal_title = cg.journal +google.citation_volume = cg.volume +google.citation_issue = cg.issue google.citation_firstpage = google.citation_lastpage = -google.citation_doi = -google.citation_issn = dc.identifier.issn -google.citation_isbn = dc.identifier.isbn +google.citation_doi = cg.identifier.doi +google.citation_issn 
= cg.issn +google.citation_isbn = cg.isbn google.citation_conference = # Type-specific fields retrieved when one of the above identifiers # is matched for the item. google.citation_dissertation_name = dc.title -google.citation_dissertation_institution = dc.publisher +google.citation_dissertation_institution = dcterms.publisher # Patent country for patent items; needs to be encoded as # a list of ISO 3166-1 alpha-3 codes per @@ -71,7 +71,7 @@ google.citation_patent_country = google.citation_patent_number = google.citation_technical_report_number = -google.citation_technical_report_institution = dc.publisher +google.citation_technical_report_institution = dcterms.publisher #priority "allow list" for citation_pdf_url, shortnames are defined in dspace/config/registries/bitstream-formats.xml #priority order is defined here, where the first type is the most important diff --git a/dspace/config/crosswalks/oai/metadataFormats/mods.xsl b/dspace/config/crosswalks/oai/metadataFormats/mods.xsl index f1a97ca6177e..9f781e39ef05 100644 --- a/dspace/config/crosswalks/oai/metadataFormats/mods.xsl +++ b/dspace/config/crosswalks/oai/metadataFormats/mods.xsl @@ -6,68 +6,214 @@ - + - - - - - - - - - - + + + + Author + aut + + + + + + + + + + Funder + fnd + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + + + + + + + + + + + + + + <xsl:value-of select="doc:metadata/doc:element[@name='cg']/doc:element[@name='journal']/doc:element/doc:field[@name='value']/text()"></xsl:value-of> + + + + + + + + + + + + + + + + + no. + + + + + + + + + + + + + + + + + + - - - - - - + + + + + - - + + + - + - + + + + + + + - - + + + + + - - - - + + + + - - + + + + + + + + - - + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + <xsl:value-of select="." /> + + + + + + + <xsl:value-of select="." 
/> + - - + + + - + diff --git a/dspace/config/crosswalks/oai/xoai.xml b/dspace/config/crosswalks/oai/xoai.xml index e843814dd7b8..cff70b03a048 100644 --- a/dspace/config/crosswalks/oai/xoai.xml +++ b/dspace/config/crosswalks/oai/xoai.xml @@ -91,6 +91,25 @@ + + + + + + + + + + + + + + + + + This contexts complies with AGRIS Guidelines. + + @@ -149,7 +168,7 @@ mods metadataFormats/mods.xsl http://www.loc.gov/mods/v3 - http://www.loc.gov/standards/mods/v3/mods-3-1.xsd + https://www.loc.gov/standards/mods/v3/mods-3-7.xsd qdc @@ -399,6 +418,30 @@ + + + + + + + + + + + + + + + + + + + + + @@ -516,6 +559,30 @@ + + + org.dspace.xoai.filter.DSpaceAtLeastOneMetadataFilter + + dcterms.type + equal + + Book + Book Chapter + Brief + Conference Paper + Conference Proceedings + Dataset + Journal Article + Manual + Report + Thesis + Training Material + Working Paper + + + + @@ -524,6 +591,11 @@ Open Access DRIVERset + + agris + AGRIS + + openaire OpenAIRE diff --git a/dspace/config/crosswalks/xhtml-head-item.properties b/dspace/config/crosswalks/xhtml-head-item.properties index f7ba355fd5a5..8206a23636ae 100644 --- a/dspace/config/crosswalks/xhtml-head-item.properties +++ b/dspace/config/crosswalks/xhtml-head-item.properties @@ -7,6 +7,7 @@ schema.DC = http://purl.org/dc/elements/1.1/ schema.DCTERMS = http://purl.org/dc/terms/ +schema.CG = https://agriculturalsemantics.github.io/cg-core/cgcore.html ####### Metadata field mappings ####### @@ -38,23 +39,23 @@ dc.date.accessioned = DCTERMS.dateAccepted,DCTERMS.W3CDTF dc.date.available = DCTERMS.available,DCTERMS.W3CDTF dc.date.copyright = DCTERMS.dateCopyrighted,DCTERMS.W3CDTF dc.date.created = DCTERMS.created,DCTERMS.W3CDTF -dc.date.issued = DCTERMS.issued,DCTERMS.W3CDTF +dcterms.issued = DCTERMS.issued,DCTERMS.W3CDTF dc.identifier = DC.identifier -dc.identifier.citation = DCTERMS.bibliographicCitation +dcterms.bibliographicCitation = DCTERMS.bibliographicCitation dc.identifier.uri = DC.identifier,DCTERMS.URI -dc.description = 
DC.description -dc.description.abstract = DCTERMS.abstract +dcterms.description = DC.description +dcterms.abstract = DCTERMS.abstract dc.description.tableofcontents = DCTERMS.tableOfContents dc.description.uri = DC.description,DCTERMS.URI dc.format = DC.format -dc.format.extent = DCTERMS.extent +dcterms.extent = DCTERMS.extent dc.format.medium = DCTERMS.medium dc.language = DC.language -dc.language.iso = DC.language,DCTERMS.RFC1766 -dc.publisher = DC.publisher -dc.relation = DC.relation +dcterms.language = DC.language,DCTERMS.RFC1766 +dcterms.publisher = DC.publisher +dcterms.relation = DC.relation dc.relation.isformatof = DCTERMS.isFormatOf -dc.relation.ispartof = DCTERMS.isPartOf +dcterms.isPartOf = DCTERMS.isPartOf dc.relation.haspart = DCTERMS.hasPart dc.relation.isversionof = DCTERMS.isVersionOf dc.relation.hasversion = DCTERMS.hasVersion @@ -65,13 +66,15 @@ dc.relation.isreplacedby = DCTERMS.isReplacedBy dc.relation.uri = DC.relation,DCTERMS.URI dc.rights = DC.rights dc.rights.uri = DC.rights,DCTERMS.URI +dcterms.license = DCTERMS.license dc.source = DC.source dc.source.uri = DC.source,DCTERMS.URI -dc.subject = DC.subject +dcterms.subject = DC.subject dc.subject.ddc = DC.subject,DCTERMS.DDC dc.subject.lcc = DC.subject,DCTERMS.LCC dc.subject.lcsh = DC.subject,DCTERMS.LCSH dc.subject.mesh = DC.subject,DCTERMS.MESH dc.title = DC.title dc.title.alternative = DCTERMS.alternative -dc.type = DC.type +dcterms.type = DCTERMS.type +dcterms.accessRights = DCTERMS.accessRights diff --git a/dspace/config/default.license b/dspace/config/default.license index 390e9786688d..57f743aca7cc 100644 --- a/dspace/config/default.license +++ b/dspace/config/default.license @@ -1,18 +1,13 @@ -NOTE: PLACE YOUR OWN LICENSE HERE -This sample license is provided for informational purposes only. 
+By signing and submitting this license, you (the author(s) or copyright owner) grants to the curators of CGSpace (see https://cgspace.cgiar.org/page/about) the non-exclusive right to reproduce, translate (as defined below), and/or distribute your submission (including the abstract) worldwide in print and electronic format and in any medium, including but not limited to audio or video. -NON-EXCLUSIVE DISTRIBUTION LICENSE +You agree that ILRI (on behalf of the CGSpace group) may, without changing the content, translate the submission to any medium or format for the purpose of preservation. -By signing and submitting this license, you (the author(s) or copyright owner) grants to DSpace University (DSU) the non-exclusive right to reproduce, translate (as defined below), and/or distribute your submission (including the abstract) worldwide in print and electronic format and in any medium, including but not limited to audio or video. +You also agree that ILRI (on behalf of the CGSpace group) may keep more than one copy of this submission for purposes of security, back-up and preservation. -You agree that DSU may, without changing the content, translate the submission to any medium or format for the purpose of preservation. +You represent that the submission is your original work, and that you have the right to grant these rights. You also represent that your submission does not, to the best of your knowledge, infringe upon anyone's copyright. -You also agree that DSU may keep more than one copy of this submission for purposes of security, back-up and preservation. +If the submission contains material for which you do not hold copyright, you represent that you have obtained the unrestricted permission of the copyright owner to grant ILRI (on behalf of the CGSpace group) the rights required by this license, and that such third-party owned material is clearly identified and acknowledged within the text or content of the submission. 
-You represent that the submission is your original work, and that you have the right to grant the rights contained in this license. You also represent that your submission does not, to the best of your knowledge, infringe upon anyone's copyright. +IF THE SUBMISSION IS BASED UPON WORK THAT HAS BEEN SPONSORED OR SUPPORTED BY AN AGENCY OR ORGANIZATION OTHER THAN ILRI OR THE CGSPACE GROUP, YOU REPRESENT THAT YOU HAVE FULFILLED ANY RIGHT OF REVIEW OR OTHER OBLIGATIONS REQUIRED BY SUCH CONTRACT OR AGREEMENT. -If the submission contains material for which you do not hold copyright, you represent that you have obtained the unrestricted permission of the copyright owner to grant DSU the rights required by this license, and that such third-party owned material is clearly identified and acknowledged within the text or content of the submission. - -IF THE SUBMISSION IS BASED UPON WORK THAT HAS BEEN SPONSORED OR SUPPORTED BY AN AGENCY OR ORGANIZATION OTHER THAN DSU, YOU REPRESENT THAT YOU HAVE FULFILLED ANY RIGHT OF REVIEW OR OTHER OBLIGATIONS REQUIRED BY SUCH CONTRACT OR AGREEMENT. - -DSU will clearly identify your name(s) as the author(s) or owner(s) of the submission, and will not make any alteration, other than as allowed by this license, to your submission. +ILRI (on behalf of the CGSpace group) will clearly identify your name(s) as the author(s) or owner(s) of the submission, and will not make any alteration, other than as allowed by this license, to your submission. diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index b7cc13e508dc..e7edc5965540 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -198,6 +198,8 @@ mail.charset = UTF-8 # mail.smtp.socketFactory.class=javax.net.ssl.SSLSocketFactory, \ # mail.smtp.socketFactory.fallback=false +mail.extraproperties = mail.smtp.starttls.enable=true, mail.smtp.ssl.protocols=TLSv1.2 + # An option is added to disable the mailserver. 
By default, this property is set to false # By setting mail.server.disabled = true, DSpace will not send out emails. # It will instead log the subject of the email which should have been sent @@ -311,7 +313,7 @@ handle.canonical.prefix = ${dspace.ui.url}/handle/ # CNRI Handle prefix # (Defaults to a dummy/fake prefix of 123456789) -handle.prefix = 123456789 +handle.prefix = 10568 # Directory for installing Handle server files handle.dir = ${dspace.dir}/handle-server @@ -319,7 +321,10 @@ handle.dir = ${dspace.dir}/handle-server # List any additional prefixes that need to be managed by this handle server # (as for examle handle prefix coming from old dspace repository merged in # that repository) -# handle.additional.prefixes = prefix1[, prefix2] +handle.additional.prefixes = 10947 + +# Allow DSpace to resolve handles from multiple prefixes (see HandlePlugin.java) +handle.plugin.checknameauthority = false # Whether to enable the DSpace handle resolver endpoints necessary for # https://github.com/DSpace/Remote-Handle-Resolver @@ -445,8 +450,8 @@ useProxies = true #Names of the enabled MediaFilter or FormatFilter plugins filter.plugins = Text Extractor -filter.plugins = JPEG Thumbnail -filter.plugins = PDFBox JPEG Thumbnail +filter.plugins = ImageMagick Image Thumbnail +filter.plugins = ImageMagick PDF Thumbnail # [To enable Branded Preview]: uncomment and insert the following into the plugin list @@ -550,7 +555,7 @@ filter.org.dspace.app.mediafilter.PDFBoxThumbnail.inputFormats = Adobe PDF # bnails. Greatly increases quality of resulting thumbnails, at the expense of # slightly longer execution times and higher memory usage. Any integer over 72 # will help, but recommend 144 for a "2x" supersample. 
-# org.dspace.app.mediafilter.ImageMagickThumbnailFilter.density = 144 +org.dspace.app.mediafilter.ImageMagickThumbnailFilter.density = 144 #### Crosswalk and Packager Plugin Settings #### # Crosswalks are used to translate external metadata formats into DSpace's internal format (DIM) @@ -780,7 +785,7 @@ event.dispatcher.default.class = org.dspace.event.BasicDispatcher # Add rdf here, if you are using dspace-rdf to export your repository content as RDF. # Add iiif here, if you are using dspace-iiif. # Add orcidqueue here, if the integration with ORCID is configured and wish to enable the synchronization queue functionality -event.dispatcher.default.consumers = versioning, discovery, eperson, submissionconfig +event.dispatcher.default.consumers = discovery, eperson, submissionconfig # The noindex dispatcher will not create search or browse indexes (useful for batch item imports) event.dispatcher.noindex.class = org.dspace.event.BasicDispatcher @@ -896,6 +901,9 @@ org.dspace.app.batchitemimport.work.dir = ${dspace.dir}/imports # default = false, (disabled) #org.dspace.content.Collection.findAuthorizedPerformanceOptimize = true +# For backwards compatibility, the subscription emails by default include any modified items +# uncomment the following entry for only new items to be emailed +eperson.subscription.onlynew = true # Identifier providers. # Following are configuration values for the EZID DOI provider, with appropriate @@ -940,7 +948,7 @@ registry.metadata.load = iiif-types.xml # can login as another user from the "edit eperson" page. This is useful for # debugging problems in a running dspace instance, especially in the workflow # process. The default value is false, i.e. no one may assume the login of another user. 
-#webui.user.assumelogin = true +webui.user.assumelogin = true # whether to display the contents of the licence bundle (often just the deposit # licence in standard DSpace installation) @@ -970,7 +978,7 @@ metadata.hide.person.email = true # Whether or not we REQUIRE that a file be uploaded # during the 'Upload' step in the submission process # Defaults to true; If set to 'false', submitter has option to skip upload -#webui.submit.upload.required = true +webui.submit.upload.required = false # Which field should be used for type-bind # Defaults to 'dc.type'; If changing this value, you must also update the related @@ -1023,8 +1031,8 @@ cc.license.locale = en # Maximum width and height (in pixels) of generated thumbnails # NOTE: In the UI's base theme, `--ds-thumbnail-max-width` defaults to 175px. # So, if you set 'thumbnail.maxwidth' >175, you may wish to modify that UI style variable as well. -thumbnail.maxwidth = 175 -thumbnail.maxheight = 175 +thumbnail.maxwidth = 500 +thumbnail.maxheight = 500 # Blur before scaling. A little blur before scaling does wonders for keeping # more in check. (Only used by JPEGFilter) @@ -1067,7 +1075,7 @@ webui.preview.brand.fontpoint = 12 # Whether to display collection and community strengths (i.e. item counts) # By default, this feature is disabled. 
-# webui.strengths.show = false +webui.strengths.show = true # Counts fetched in real time will perform an actual count of the # index contents every time a page with this feature is requested, @@ -1156,8 +1164,10 @@ webui.preview.brand.fontpoint = 12 webui.browse.index.1 = dateissued:item:dateissued webui.browse.index.2 = author:metadata:dc.contributor.*\,dc.creator:text webui.browse.index.3 = title:item:title -webui.browse.index.4 = subject:metadata:dc.subject.*:text -#webui.browse.index.5 = dateaccessioned:item:dateaccessioned +webui.browse.index.4 = subject:metadata:dc.subject.*\,dcterms.subject:text +webui.browse.index.5 = region:metadata:cg.coverage.region:text +webui.browse.index.6 = country:metadata:cg.coverage.country:text +webui.browse.index.7 = itemtype:metadata:dcterms.type:text ## example of authority-controlled browse category - see authority control config #webui.browse.index.5 = lcAuthor:metadataAuthority:dc.contributor.author:authority @@ -1166,7 +1176,8 @@ webui.browse.index.4 = subject:metadata:dc.subject.*:text # vocabularies in the submission forms. These could be disabled adding the name of # the vocabularies to exclude in this comma-separated property. # (Requires reboot of servlet container, e.g. Tomcat, to reload) -# webui.browse.vocabularies.disabled = srsc +webui.browse.vocabularies.disabled = srsc, dcterms-subject, dc-contributor-author, cg-contributor-donor, cg-contributor-affiliation, dcterms-publisher + # Enable/Disable tag cloud in browsing. # webui.browse.index.tagcloud. = true | false @@ -1198,8 +1209,9 @@ webui.browse.index.4 = subject:metadata:dc.subject.*:text # but otherwise don't want users to choose that option. 
# webui.itemlist.sort-option.1 = title:dc.title:title -webui.itemlist.sort-option.2 = dateissued:dc.date.issued:date +webui.itemlist.sort-option.2 = dateissued:dcterms.issued:date webui.itemlist.sort-option.3 = dateaccessioned:dc.date.accessioned:date +webui.itemlist.sort-option.4 = type:dcterms.type:text # Set the options for how the indexes are sorted # @@ -1238,6 +1250,7 @@ plugin.named.org.dspace.sort.OrderFormatDelegate= \ # # The default below defines the authors to link to other publications by that author webui.browse.link.1 = author:dc.contributor.* +webui.browse.link.2 = subject:dcterms.subject #### Display browse frequencies # @@ -1353,7 +1366,7 @@ websvc.opensearch.autolink = true websvc.opensearch.validity = 48 # short name used in browsers for search service # should be 16 or fewer characters -websvc.opensearch.shortname = DSpace +websvc.opensearch.shortname = CGSpace # longer (up to 48 characters) name websvc.opensearch.longname = ${dspace.name} # brief service description @@ -1464,7 +1477,7 @@ log.report.dir = ${dspace.dir}/log # You can add more than one 'mark_[value]' options (with different value) in case you need to mark items more than one time for # different purposes. Remember to add the respective beans in file 'config/spring/api/item-marking.xml'. # -# webui.itemlist.columns = dc.date.issued(date), dc.title, dc.contributor.* +webui.itemlist.columns = dcterms.accessRights,dcterms.issued(date),dcterms.type,dc.title,dc.contributor.*,cg.coverage.country,cg.coverage.region,cg.coverage.subregion # # Additionally, you can override the DC fields used on the listing page for # a given browse index and/or sort option. As a sort option or index may be defined @@ -1497,7 +1510,7 @@ log.report.dir = ${dspace.dir}/log # inside that snipet is your Google Analytics key usually found in this line: # _uacct = "UA-XXXXXXX-X" # Take this key (just the UA-XXXXXX-X part) and place it here in this parameter. 
-# google.analytics.key=UA-XXXXXX-X +google.analytics.key=UA-10691096-8 # The max number of events held in the GA buffer (default: 256) # google.analytics.buffer.limit=256 @@ -1532,7 +1545,7 @@ google-analytics.bundles = ORIGINAL # all - Anonymous users can request an item # logged - Login is mandatory to request an item # empty/commented out - request-copy not allowed -request.item.type = all +#request.item.type = all # Should all Request Copy emails go to the helpdesk instead of the item submitter? request.item.helpdesk.override = false # Should a rejection of a copy request send an email back to the requester? @@ -1544,7 +1557,7 @@ request.item.reject.email = true #------------------SUBMISSION CONFIGURATION------------------------# #------------------------------------------------------------------# # Field to use for type binding, default dc.type -submit.type-bind.field = dc.type +submit.type-bind.field = dcterms.type #---------------------------------------------------------------# #----------SOLR DATABASE RESYNC SCRIPT CONFIGURATION------------# @@ -1663,3 +1676,7 @@ include = ${module_dir}/usage-statistics.cfg include = ${module_dir}/versioning.cfg include = ${module_dir}/workflow.cfg include = ${module_dir}/external-providers.cfg + +# Configuration for CGSpace curation tasks +include = ${module_dir}/countrycodetagger.cfg +include = ${module_dir}/countrycodetagger.force.cfg diff --git a/dspace/config/emails/subscriptions_content b/dspace/config/emails/subscriptions_content index 9b8c91e559df..a1550f012be7 100644 --- a/dspace/config/emails/subscriptions_content +++ b/dspace/config/emails/subscriptions_content @@ -8,11 +8,11 @@ This email is sent from ${config.get('dspace.name')} based on the chosen subscri #if( not( "$params[0]" == "" )) Community Subscriptions: ------------------------ -List of changed items : ${params[0]} +List of new/changed items : ${params[0]} #end #if( not( "$params[1]" == "" )) Collection Subscriptions: ------------------------- -List 
of changed items : ${params[1]} +List of new/changed items : ${params[1]} #end diff --git a/dspace/config/item-submission.xml b/dspace/config/item-submission.xml index 1060a3303119..6abdc56e230c 100644 --- a/dspace/config/item-submission.xml +++ b/dspace/config/item-submission.xml @@ -109,6 +109,11 @@ org.dspace.app.rest.submit.step.DescribeStep submission-form + + submit.progressbar.describe.stepthree + org.dspace.app.rest.submit.step.DescribeStep + submission-form + submit.progressbar.describe.stepone org.dspace.app.rest.submit.step.DescribeStep @@ -261,12 +266,13 @@ + - + diff --git a/dspace/config/log4j2.xml b/dspace/config/log4j2.xml index 6e9a43e4f0fe..3e73a134e93d 100644 --- a/dspace/config/log4j2.xml +++ b/dspace/config/log4j2.xml @@ -97,6 +97,10 @@ + + + diff --git a/dspace/config/modules/authentication-ldap.cfg b/dspace/config/modules/authentication-ldap.cfg index bcc29ccac551..b2916b88e396 100644 --- a/dspace/config/modules/authentication-ldap.cfg +++ b/dspace/config/modules/authentication-ldap.cfg @@ -27,7 +27,7 @@ # With the setting off, users will be required to register and login with # their email address. With this setting on, users will be able to login # and register with their LDAP user ids and passwords. -authentication-ldap.enable = false +authentication-ldap.enable = true ##### LDAP AutoRegister Settings ##### @@ -51,7 +51,7 @@ authentication-ldap.autoregister = true # This is the unique identifier field in the LDAP directory # where the username is stored. -#authentication-ldap.id_field = uid +authentication-ldap.id_field = sAMAccountName # This is the object context used when authenticating the # user. It is appended to the id_field and username. @@ -72,31 +72,31 @@ authentication-ldap.autoregister = true # parameter. But again this depends on each individual LDAP server # configuration. 
# Note: Prepend commas with a backslash to escape them -#authentication-ldap.search_context = ou=people\,ou=faculties +authentication-ldap.search_context = dc=cgiarad\,dc=org # This is the LDAP object field where the user's email address # is stored. "mail" is the default and the most common for # LDAP servers. If the mail field is not found the username # will be used as the email address when creating the eperson # object. -#authentication-ldap.email_field = mail +authentication-ldap.email_field = mail # This is the LDAP object field where the user's last name is # stored. "sn" is the default and is the most common for LDAP # servers. If the field is not found the field will be left # blank in the new eperson object. -#authentication-ldap.surname_field = sn +authentication-ldap.surname_field = sn # This is the LDAP object field where the user's given names # are stored. This may not be used or set in all LDAP instances. # If the field is not found the field will be left blank in the # new eperson object. -#authentication-ldap.givenname_field = givenName +authentication-ldap.givenname_field = givenName # This is the field where the user's phone number is stored in # the LDAP directory. If the field is not found the field # will be left blank in the new eperson object. -#authentication-ldap.phone_field = telephoneNumber +authentication-ldap.phone_field = telephoneNumber ##### LDAP users group ##### @@ -104,8 +104,7 @@ authentication-ldap.autoregister = true # If required, a group name can be given here, and all users who log in # to LDAP will automatically become members of this group. This is useful # if you want a group made up of all internal authenticated users. 
-#authentication-ldap.login.specialgroup = group-name - +authentication-ldap.login.specialgroup = CGIAR_LDAP_USERS ##### Hierarchical LDAP Settings ##### @@ -128,7 +127,7 @@ authentication-ldap.autoregister = true # object scope : 0 # one level scope : 1 # subtree scope : 2 -#authentication-ldap.search_scope = 2 +authentication-ldap.search_scope = 2 # If true, the initial bind will be performed anonymously. #authentication-ldap.search.anonymous = false @@ -150,8 +149,7 @@ authentication-ldap.autoregister = true # in user's full DN. If it's found, assign user to the DSpace group # specified by the right part of the groupmap value (after the ":"). # One user may belong to multiple groups. -#authentication-ldap.login.groupmap.1 = ou=ldap-dept1:dspace-group1 -#authentication-ldap.login.groupmap.2 = ou=ldap-dept2:dspace-groupA +authentication-ldap.login.groupmap.1 = OU=ILRIHUB:ILRI_LDAP_USERS #authentication-ldap.login.groupmap.3 = ou=ldap-dept3:dspace-groupA # If this property is uncommented, it changes the meaning of the left part of diff --git a/dspace/config/modules/authentication.cfg b/dspace/config/modules/authentication.cfg index 568f871e3cd7..9868b42d6602 100644 --- a/dspace/config/modules/authentication.cfg +++ b/dspace/config/modules/authentication.cfg @@ -44,7 +44,7 @@ #plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.IPAuthentication # LDAP authentication/authorization. See authentication-ldap.cfg for default configuration. -#plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.LDAPAuthentication +plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.LDAPAuthentication # Shibboleth authentication/authorization. See authentication-shibboleth.cfg for default configuration. 
#plugin.sequence.org.dspace.authenticate.AuthenticationMethod = org.dspace.authenticate.ShibAuthentication diff --git a/dspace/config/modules/countrycodetagger.cfg b/dspace/config/modules/countrycodetagger.cfg new file mode 100644 index 000000000000..e074715fa731 --- /dev/null +++ b/dspace/config/modules/countrycodetagger.cfg @@ -0,0 +1,8 @@ +# name of the field containing ISO 3166-1 country names +countrycodetagger.iso3166.field = cg.coverage.country + +# name of the field containing ISO 3166-1 Alpha2 country codes +countrycodetagger.iso3166-alpha2.field = cg.coverage.iso3166-alpha2 + +# only add country codes if an item doesn't have any (default false) +#countrycodetagger.forceupdate = false diff --git a/dspace/config/modules/countrycodetagger.force.cfg b/dspace/config/modules/countrycodetagger.force.cfg new file mode 100644 index 000000000000..5425ecb94cbf --- /dev/null +++ b/dspace/config/modules/countrycodetagger.force.cfg @@ -0,0 +1,8 @@ +# name of the field containing ISO 3166-1 country names +countrycodetagger.force.iso3166.field = cg.coverage.country + +# name of the field containing ISO 3166-1 Alpha2 country codes +countrycodetagger.force.iso3166-alpha2.field = cg.coverage.iso3166-alpha2 + +# clear existing country codes and add new ones +countrycodetagger.force.forceupdate = true diff --git a/dspace/config/modules/curate.cfg b/dspace/config/modules/curate.cfg index 1d7b87960df1..2db0494e71c2 100644 --- a/dspace/config/modules/curate.cfg +++ b/dspace/config/modules/curate.cfg @@ -8,15 +8,19 @@ # NOTE: Other configurations can append to this list of default tasks by simply # adding their own additional values of "plugin.named.org.dspace.curate.CurationTask" -plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.NoOpCurationTask = noop +#plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.NoOpCurationTask = noop plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.ProfileFormats = profileformats 
plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RequiredMetadata = requiredmetadata #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.ClamScan = vscan #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.MicrosoftTranslator = translate plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.MetadataValueLinkChecker = checklinks -plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RegisterDOI = registerdoi +#plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.RegisterDOI = registerdoi #plugin.named.org.dspace.curate.CurationTask = org.dspace.ctask.general.CitationPage = citationpage # add new tasks here (or in additional config files) +plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger +plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger.force + +plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.NormalizeDOIs = normalizedois ## task queue implementation plugin.single.org.dspace.curate.TaskQueue = org.dspace.curate.FileTaskQueue diff --git a/dspace/config/modules/discovery.cfg b/dspace/config/modules/discovery.cfg index 72088ddc49fa..7699a34e313a 100644 --- a/dspace/config/modules/discovery.cfg +++ b/dspace/config/modules/discovery.cfg @@ -22,7 +22,7 @@ discovery.search.server = ${solr.server}/${solr.multicorePrefix}search # discovery.index.ignore-variants = false # discovery.index.ignore-authority = false -discovery.index.projection=dc.title,dc.contributor.*,dc.date.issued +discovery.index.projection=dc.title,dc.contributor.*,dcterms.issued # Allow auto-reindexing. # If any database migrations are applied to your database (via Flyway), then a @@ -31,7 +31,7 @@ discovery.index.projection=dc.title,dc.contributor.*,dc.date.issued # property is enabled AND that such a file exists. 
If the two conditions are # satisfied, a background reindex of all content is triggered in Discovery. # Defaults to true: auto-reindexing is enabled. -#discovery.autoReindex = true +discovery.autoReindex = false # Value used for the namedresourcetype facet used by the mydspace # \n|||\n### diff --git a/dspace/config/modules/rest.cfg b/dspace/config/modules/rest.cfg index 537eedbd087b..d62565da8dc4 100644 --- a/dspace/config/modules/rest.cfg +++ b/dspace/config/modules/rest.cfg @@ -8,7 +8,7 @@ # Defaults to ${dspace.ui.url} if unspecified (as the UI must have access to the REST API). # Multiple allowed origin URLs may be comma separated. Wildcard value (*) is NOT SUPPORTED. # (Requires reboot of servlet container, e.g. Tomcat, to reload) -rest.cors.allowed-origins = ${dspace.ui.url} +rest.cors.allowed-origins = ${dspace.ui.url}, http://localhost:4000 # Whether or not to allow credentials (e.g. cookies) sent by the client/browser in CORS # requests (in "Access-Control-Allow-Credentials" header). diff --git a/dspace/config/modules/solr-statistics.cfg b/dspace/config/modules/solr-statistics.cfg index 073850ca232e..f57132741a21 100644 --- a/dspace/config/modules/solr-statistics.cfg +++ b/dspace/config/modules/solr-statistics.cfg @@ -25,7 +25,7 @@ solr-statistics.configset = statistics # control solr statistics querying to look at "isBot" field to determine # if record is a bot. true by default. -#solr-statistics.query.filter.isBot = true +solr-statistics.query.filter.isBot = true # Whether or not explicit solr.commit can be done in SolrLoggerServiceImpl#postView, or to be left to the autocommit. # Defaults to true (i.e. via autoCommit, no explicit commits); set to false in statistics tests (e.g. 
StatisticsRestRepositoryIT) diff --git a/dspace/config/modules/usage-statistics.cfg b/dspace/config/modules/usage-statistics.cfg index c77bb1ca78a3..199992617776 100644 --- a/dspace/config/modules/usage-statistics.cfg +++ b/dspace/config/modules/usage-statistics.cfg @@ -35,11 +35,11 @@ usage-statistics.authorization.admin.workflow=true # If true, event will be logged with the 'isBot' field set to true # (see query.filter.* for query filter options) # Default value is true. -#usage-statistics.logBots = true +usage-statistics.logBots = false # Enable/disable if a matching for a bot should be case sensitive # Setting this value to true will increase cpu usage, but bots will be found more accurately -#usage-statistics.bots.case-insensitive = false +usage-statistics.bots.case-insensitive = true # Set to true if the statistics core is sharded into a core per year, defaults to false # If you are sharding your statistics index each year by running "dspace stats-util -s", you should set this to "true" diff --git a/dspace/config/modules/versioning.cfg b/dspace/config/modules/versioning.cfg index 1690ceac4cd2..9f8540c00edf 100644 --- a/dspace/config/modules/versioning.cfg +++ b/dspace/config/modules/versioning.cfg @@ -5,7 +5,7 @@ #---------------------------------------------------# # The property versioning.enabled is used to enabled/disable versioning in DSpace, # the default value is true if it unset -# versioning.enabled = true +versioning.enabled = false # Control if the history overview of an item should only be shown to administrators # If enabled only the administrators for the item will be able to view the versioning history diff --git a/dspace/config/spiders/agents/ilri b/dspace/config/spiders/agents/ilri new file mode 100644 index 000000000000..b89f6281bf97 --- /dev/null +++ b/dspace/config/spiders/agents/ilri @@ -0,0 +1,47 @@ +Delphi +RI\/1\.0 +PostmanRuntime +node-fetch +Photon +StatusCake_Pagespeed_indev +node-superagent +cortex +FlipboardProxy +GARDIAN +randint 
+scalaj-http +scpitspi-rs +lua-resty-http +AHC +acebookexternalhit +Iframely +qbhttp +^got +^colly +article-parser +1science +Moreover\/\d +Nutch-\d +Exploratodo\/\d +Crowsnest\/\d +metha\/\d +FullStoryBot +SomeRandomText +ubermetrics +curb +bitdiscovery +omgili +Vizzit +Java\/17-ea +AdobeUxTechC4-Async +ZaloPC-win32-24v473 +nbertaupete95 +Scoop\.it +WebAPIClient +RStudio +^MEL +GuzzleHttp +Owler +newspaperjs +^Chrome$ +curl diff --git a/dspace/config/spring/api/crossref-integration.xml b/dspace/config/spring/api/crossref-integration.xml index d1e416d2b0c6..290867068b4d 100644 --- a/dspace/config/spring/api/crossref-integration.xml +++ b/dspace/config/spring/api/crossref-integration.xml @@ -38,7 +38,7 @@ - + @@ -46,7 +46,7 @@ - + @@ -54,7 +54,7 @@ - + @@ -76,7 +76,7 @@ - + @@ -84,7 +84,7 @@ - + @@ -111,7 +111,7 @@ - + @@ -119,7 +119,7 @@ - + @@ -127,7 +127,7 @@ - + @@ -135,7 +135,7 @@ - + diff --git a/dspace/config/spring/api/datacite-integration.xml b/dspace/config/spring/api/datacite-integration.xml index 236ec0a3bda9..911d5bec3a29 100644 --- a/dspace/config/spring/api/datacite-integration.xml +++ b/dspace/config/spring/api/datacite-integration.xml @@ -56,7 +56,7 @@ - + diff --git a/dspace/config/spring/api/discovery.xml b/dspace/config/spring/api/discovery.xml index fb25f11598fa..b165c461bbba 100644 --- a/dspace/config/spring/api/discovery.xml +++ b/dspace/config/spring/api/discovery.xml @@ -163,10 +163,22 @@ - + - - + + + + + + + + + + + + + + @@ -176,17 +188,22 @@ - + - - - - - - - - - + + + + + + + + + + + + + + @@ -197,7 +214,9 @@ + + @@ -244,7 +263,7 @@ - + @@ -291,7 +310,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject @@ -345,7 +364,9 @@ + + @@ -393,7 +414,7 @@ - + @@ -428,7 +449,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject @@ -484,7 +505,9 @@ + + @@ -532,7 +555,7 @@ - + @@ -567,7 +590,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject @@ -626,7 +649,9 @@ + + @@ 
-672,7 +697,7 @@ - + @@ -707,7 +732,7 @@ dc.title dc.contributor.author dc.creator - dc.subject + dcterms.subject @@ -751,10 +776,11 @@ - + + @@ -782,7 +808,7 @@ - + @@ -829,6 +855,7 @@ + @@ -857,7 +884,7 @@ - + @@ -902,10 +929,11 @@ - + + @@ -933,7 +961,7 @@ - + @@ -981,6 +1009,7 @@ + @@ -1007,7 +1036,7 @@ - + @@ -1056,6 +1085,7 @@ + @@ -1082,7 +1112,7 @@ - + @@ -1140,7 +1170,9 @@ + + @@ -1210,7 +1242,9 @@ + + @@ -1274,6 +1308,7 @@ + @@ -1336,6 +1371,7 @@ + @@ -1509,6 +1545,7 @@ + @@ -1573,6 +1610,7 @@ + @@ -1634,6 +1672,7 @@ + @@ -1694,6 +1733,7 @@ + @@ -1754,6 +1794,7 @@ + @@ -1813,6 +1854,7 @@ + @@ -1873,6 +1915,7 @@ + @@ -1932,6 +1975,7 @@ + @@ -2004,6 +2048,7 @@ + @@ -2063,6 +2108,7 @@ + @@ -2320,17 +2366,16 @@ - + - dc.subject.* + dcterms.subject - @@ -2338,7 +2383,7 @@ - dc.date.issued + dcterms.issued @@ -2383,7 +2428,7 @@ - dc.type + dcterms.type @@ -2848,6 +2893,181 @@ + + + + + + cg.coverage.country + + + + + + + + + + + + + + + cg.coverage.region + + + + + + + + + + + + + + + dcterms.accessRights + + + + + + + + + + + + + + + dcterms.license + + + + + + + + + + + + + + + cg.contributor.initiative + + + + + + + + + + + + + + + cg.contributor.affiliation + + + + + + + + + + + + + + + cg.contributor.crp + + + + + + + + + + + + cg.contributor.donor + + + + + + + + + + + + cg.subject.sdg + + + + + + + + + + + + cg.subject.impactArea + + + + + + + + + + + + cg.subject.actionArea + + + + + + + + + + + + dcterms.publisher + + + + + + + + + + + + cg.subject.impactPlatform + + + + + + + @@ -2859,11 +3079,17 @@ - + - + + + + + + + @@ -2934,7 +3160,7 @@ - + diff --git a/dspace/config/spring/api/external-services.xml b/dspace/config/spring/api/external-services.xml index 6d7d50c39f1b..f3391ce35bda 100644 --- a/dspace/config/spring/api/external-services.xml +++ b/dspace/config/spring/api/external-services.xml @@ -107,7 +107,7 @@ - + Publication @@ -215,7 +215,7 @@ - + Publication diff --git a/dspace/config/spring/api/identifier-service.xml 
b/dspace/config/spring/api/identifier-service.xml index 79e19e879e8e..578ca9c00401 100644 --- a/dspace/config/spring/api/identifier-service.xml +++ b/dspace/config/spring/api/identifier-service.xml @@ -24,7 +24,7 @@ The VersionedHandleIdentifierProvider creates a new versioned handle for every new version. --> - +
+ + + dc + title + + false + + onebox + Enter the full official title of the item (including any subtitles, etc). + You must enter a main title for this item. + + + + + dcterms + alternative + + true + + onebox + Enter any alternative titles for the item (other languages, etc). + + + + + + dcterms + type + + false + + dropdown + Select the type of content of the item. + You must select item type. + + +
+ +
dc contributor author true - + + dc-contributor-author onebox - Enter the author's name (Family name, Given names). + Enter the names of individual or corporate authors of this item (enter each individually). + Please enter the author(s). + + + + + cg + creator + identifier + true + + cg-creator-identifier + onebox + Enter ORCID identifiers (for CGIAR authors at least). Click above to select from a pre-populated list of author ORCID identifiers. Enter one per author. If an identifier is missing, enter a new one in the same format (Name: 0000-0002-1735-7458). Use the exact name style the author uses at https://orcid.org - dc - title + cg + contributor + affiliation + true + + cg-contributor-affiliation + onebox + Enter the full name of the institutions that the author(s) work for (enter each individually). + + + + cg + contributor + donor + true + + onebox + cg-contributor-donor + Enter full institution name(s) who sponsored the item. Click above to select from a pre-populated list, or add manually. Outputs of CGIAR platforms and initiatives are always sponsored by 'CGIAR Trust Fund'. + + + + + + cg + authorship + types + true + + dropdown + Characterize the entire authorship based on author affiliations. + + + + + + cg + journal false - + onebox - Enter the main title of the item. - You must enter a main title for this item. + Enter the full journal title. + Journal Article,Data Paper + - dc - title - alternative - true - + cg + volume + + false + + onebox - If the item has any alternative titles, please enter them here. + Enter the journal volume. For example, if published in PLoS ONE 16(1), the volume is: 16 + Journal Article,Data Paper + + + + cg + issue + + false + + onebox + + Enter the journal issue. For example, if published in PLoS ONE 16(1), the issue number is: 1 + Journal Article,Data Paper - dc - date - issued + cg + edition + false - - + + + onebox + Enter the edition of the item (3rd, 17th, Revised). 
+ + + + dcterms + extent + + false + + + onebox + Enter the page range for an article or chapter (p. 10-17); or the total pages for a report (120 p.). + + + + + + dcterms + issued + + false + date - Please give the date of previous publication or public distribution. - You can leave out the day and/or month if they aren't applicable. - + Enter the date when the item was officially published or made public. For journal articles, this is the date the article was included in a volume/issue, if known. If the issue date is not known, please use the online first date here. You must enter at least the year. - dc - publisher + dcterms + available false + + date + Enter the date when the item was available online. Use this to indicate a journal article was online first, and use the publication date for the issue date. + Journal Article,Data Paper + + + + + + dcterms + publisher + + true - + onebox - Enter the name of the publisher of the previously issued instance of this item. + Enter the full name of the publisher. Click to see a pre-populated list of common commercial and CGIAR publishers or enter a new one. + + dcterms-publisher + + + cg + identifier + publicationRank + false + + + dropdown + IFPRI publication rank. - dc - identifier - citation + dcterms + bibliographicCitation + false - + onebox - Enter the standard citation for the previously issued instance of this item. - + Enter the standard citation. + Standard citation is required. - dc - relation - ispartofseries - true + dcterms + isPartOf + + false + + + onebox + Enter the full official series/report name, for example CCAFS Info Note or ILRI Research Report. + + + + cg + number + + false - series - Enter the series and number assigned to this item by your community. + + onebox + Enter the series/report number. - dc - identifier + dcterms + abstract + + false + + textarea + Enter the abstract of the item. + + + + + + dcterms + accessRights + + false + + dropdown + Indicate the accessibility of the item. 
If the item is free to read this should be "Open Access", even if it is copyrighted. + Access rights is required. + + + dcterms + license + false + + dropdown + Indicate the usage rights of the item. For most items this will be a Creative Commons license. Look on the publisher's page or in the PDF itself if available. Choose "Other" if the item does not have a license or none is specified. + Usage rights is required. + + + + + cg + isijournal + + false + + Indicate if the item was published in an ISI journal. + dropdown + + Journal Article,Data Paper + + + + + cg + howPublished + + false + + Indicate whether the item was formally published or not. In general, any output other than journal articles and books are grey literature. + dropdown + + + + cg + reviewStatus + + false + + Indicate whether the item has undergone internal or peer review. + dropdown + + + + + + dcterms + audience - true - - qualdrop_value - If the item has any identification numbers or codes associated with - it, please enter the types and the actual numbers or codes. - + + Indicate the main audience for whom the item is produced. + dropdown - dc - type + cg + isbn + + false + + + onebox + Enter the ISBN for the item, for example: 978-3-16-148410-0 + + + + cg + issn + false + + + onebox + Enter the ISSN for the serial publication where this item appears, for example: 2049-3630 + + Journal Article,Data Paper + + + + + cg + identifier + url true - - dropdown - Select the type of content of the item. - + + + onebox + Enter an official external URL for this item. For example: a link to the website, blog post, dataset, etc itself. Do not enter a DOI here. + + + + cg + identifier + doi + false + + + onebox + If this item has a DOI, enter the full address here. For example: https://doi.org/10.1038/s41598-019-43406-0 - dc + dcterms language - iso - false + + true + dropdown Select the language of the main content of the item. If the language does not appear in the list, please select 'Other'. 
If the content does not really have a language (for example, if it - is a dataset or an image) please select 'N/A'. + is a dataset or an image) please leave this blank. + + cg + place + + false + + + onebox + NOT for journal articles. Enter the city and country, for example: Nairobi, Kenya + +
-
+ - dc + cg + contributor + initiative + true + + + dropdown + Select any CGIAR Initiative(s) associated with this item. Use this to show that an Initiative funded this item. + + + + + + cg + subject + impactPlatform + true + + + dropdown + Select any CGIAR Impact Platforms associated with this item. + + + + + + cg + subject + impactArea + true + + dropdown + Select any CGIAR Impact Areas associated with this item. + + + + cg + subject + actionArea + true + + dropdown + Select any CGIAR Action Areas associated with this item. + + + + + + cg + subject + sdg + true + + dropdown + Select any UN Sustainable Development Goals associated with this item. + + + + + + cg + identifier + project + true + + + onebox + cg-identifier-project + Unique identifier for a project associated with this item. + + + + cg + contributor + crp + true + + + dropdown + Select any CGIAR Research Program(s) and Platform(s) associated with this item. Use this to show that a CRP funded this item. + + + + + + cg + coverage + country + true + + dropdown + Select a country or countries within the scope of the item. + + + + + + cg + coverage + region + true + + dropdown + Select a region or regions within the scope of the item. + + + + cg + coverage + subregion + true + + onebox + cg-coverage-subregion + Enter the subregion. Normally provinces, states, regions, etc WITHIN a country according to ISO 3166-2 subdivisions. + + + + + + cg + river + basin + true + + dropdown + Select the focus basin(s). + + + + + + dcterms subject - true - - tag - Enter appropriate subject keywords or phrases. + + onebox + Enter AGROVOC subjects in lower case. - srsc + dcterms-subject - dc - description - abstract - false - - textarea - Enter the abstract of the item. + cg + identifier + iitatheme + true + + dropdown + + + + + cg + subject + iita + true + + dropdown + - dc - description - sponsorship + cg + identifier + iwmilibrary false - - textarea - Enter the names of any sponsors and/or funding codes in the box. 
+ + onebox + IWMI Library internal reference (eg H049940) - dc + cg + subject + alliancebiovciat + true + + dropdown + + + + + + + cg + subject + cip + true + + dropdown + + + + + + + cg + subject + ilri + true + + dropdown + + + + + + + dcterms description false - + textarea Enter any other description or comments in this box. + + + cg + link + video + false + + + onebox + Enter related video link (eg a video about this item). + + + + cg + link + audio + false + + + onebox + Enter related audio link (eg an audio recording about this item). + + + + cg + link + photo + false + + + onebox + Enter related photo link (eg a photo or image associated with this item). + + + + + + dcterms + relation + + true + + + onebox + Enter related reference link (normally a URL to another item). + + + + cg + identifier + dataurl + true + + + onebox + Enter URL for any associated data file(s), in a repository for example. + + + + + + cg + link + citation + false + + onebox + Enter related reference citation (normally a full citation, not a URL). + + + + + + cg + species + + true + + onebox + Enter scientific name of organism, plant, animal, etc if it is a main focus of the item, for example: Lablab purpureus, Bos taurus, Oryza sativa, Theileria parva + + + + cg + species + breed + true + + onebox + cg-species-breed + + + 
@@ -1083,7 +1628,6 @@ tag Enter appropriate subject keywords or phrases. - srsc @@ -1394,149 +1938,3650 @@ - Animation - Animation + Abstract + Abstract + + + Audio + Audio - Article - Article + Blog Post + Blog Post Book Book - Book chapter - Book chapter + Book Chapter + Book Chapter + + + Brief + Brief + + + Brochure + Brochure + + + Case Study + Case Study + + + Conference Paper + Conference Paper + + + Conference Proceedings + Conference Proceedings + + + Data Paper + Data Paper Dataset Dataset - Learning Object - Learning Object + Equation + Equation + + + Extension Material + Extension Material Image Image - Image, 3-D - Image, 3-D + Infographic + Infographic - Map - Map + Internal Document + Internal Document - Musical Score - Musical Score + Journal Article + Journal Article - Plan or blueprint - Plan or blueprint + Journal Item + Journal Item - Preprint - Preprint + Logo + Logo - Presentation - Presentation + Manual + Manual - Recording, acoustical - Recording, acoustical + Manuscript-unpublished + Manuscript-unpublished - Recording, musical - Recording, musical + Map + Map - Recording, oral - Recording, oral + News Item + News Item - Software - Software + Newsletter + Newsletter - Technical Report - Technical Report + Opinion Piece + Opinion Piece - Thesis - Thesis + Photo Report + Photo Report - Video - Video + Poster + Poster - Working Paper - Working Paper - - - Other - Other + Presentation + Presentation - - - - - N/A - + Press Item + Press Item - English (United States) - en_US + Report + Report - English - en + Software + Software - Spanish - es + Source Code + Source Code - German - de + Template + Template - French - fr + Thesis + Thesis - Italian - it + Training Material + Training Material - Japanese - ja + Video + Video - Chinese - zh + Website + Website - Portuguese - pt + Working Paper + Working Paper - Turkish - tr + Wiki + Wiki - (Other) - other + Other + Other + + + + English + en + + + French + fr + + + Spanish + es + + + Afar + aa + + + 
Abkhazian + ab + + + Afrikaans + af + + + Akan + ak + + + Amharic + am + + + Arabic + ar + + + Aragonese + an + + + Assamese + as + + + Avaric + av + + + Avestan + ae + + + Aymara + ay + + + Azerbaijani + az + + + Bashkir + ba + + + Bambara + bm + + + Belarusian + be + + + Bengali + bn + + + Bislama + bi + + + Tibetan + bo + + + Bosnian + bs + + + Breton + br + + + Bulgarian + bg + + + Catalan + ca + + + Czech + cs + + + Chamorro + ch + + + Chechen + ce + + + Church Slavic + cu + + + Chuvash + cv + + + Cornish + kw + + + Corsican + co + + + Cree + cr + + + Welsh + cy + + + Danish + da + + + German + de + + + Dhivehi + dv + + + Dzongkha + dz + + + Modern Greek (1453-) + el + + + Esperanto + eo + + + Estonian + et + + + Basque + eu + + + Ewe + ee + + + Faroese + fo + + + Persian + fa + + + Fijian + fj + + + Finnish + fi + + + Western Frisian + fy + + + Fulah + ff + + + Scottish Gaelic + gd + + + Irish + ga + + + Galician + gl + + + Manx + gv + + + Guarani + gn + + + Gujarati + gu + + + Haitian + ht + + + Hausa + ha + + + Serbo-Croatian + sh + + + Hebrew + he + + + Herero + hz + + + Hindi + hi + + + Hiri Motu + ho + + + Croatian + hr + + + Hungarian + hu + + + Armenian + hy + + + Igbo + ig + + + Ido + io + + + Sichuan Yi + ii + + + Inuktitut + iu + + + Interlingue + ie + + + Interlingua (International Auxiliary Language Association) + ia + + + Indonesian + id + + + Inupiaq + ik + + + Icelandic + is + + + Italian + it + + + Javanese + jv + + + Japanese + ja + + + Kalaallisut + kl + + + Kannada + kn + + + Kashmiri + ks + + + Georgian + ka + + + Kanuri + kr + + + Kazakh + kk + + + Central Khmer + km + + + Kikuyu + ki + + + Kinyarwanda + rw + + + Kirghiz + ky + + + Komi + kv + + + Kongo + kg + + + Korean + ko + + + Kuanyama + kj + + + Kurdish + ku + + + Lao + lo + + + Latin + la + + + Latvian + lv + + + Limburgan + li + + + Lingala + ln + + + Lithuanian + lt + + + Luxembourgish + lb + + + Luba-Katanga + lu + + + Ganda + lg + + + Marshallese + mh + + + Malayalam + ml + + + 
Marathi + mr + + + Macedonian + mk + + + Malagasy + mg + + + Maltese + mt + + + Mongolian + mn + + + Maori + mi + + + Malay (macrolanguage) + ms + + + Burmese + my + + + Nauru + na + + + Navajo + nv + + + South Ndebele + nr + + + North Ndebele + nd + + + Ndonga + ng + + + Nepali (macrolanguage) + ne + + + Dutch + nl + + + Norwegian Nynorsk + nn + + + Norwegian Bokmål + nb + + + Norwegian + no + + + Nyanja + ny + + + Occitan (post 1500) + oc + + + Ojibwa + oj + + + Oriya (macrolanguage) + or + + + Oromo + om + + + Ossetian + os + + + Panjabi + pa + + + Pali + pi + + + Polish + pl + + + Portuguese + pt + + + Pushto + ps + + + Quechua + qu + + + Romansh + rm + + + Romanian + ro + + + Rundi + rn + + + Russian + ru + + + Sango + sg + + + Sanskrit + sa + + + Sinhala + si + + + Slovak + sk + + + Slovenian + sl + + + Northern Sami + se + + + Samoan + sm + + + Shona + sn + + + Sindhi + sd + + + Somali + so + + + Southern Sotho + st + + + Albanian + sq + + + Sardinian + sc + + + Serbian + sr + + + Swati + ss + + + Sundanese + su + + + Swahili (macrolanguage) + sw + + + Swedish + sv + + + Tahitian + ty + + + Tamil + ta + + + Tatar + tt + + + Telugu + te + + + Tajik + tg + + + Tagalog + tl + + + Thai + th + + + Tigrinya + ti + + + Tonga (Tonga Islands) + to + + + Tswana + tn + + + Tsonga + ts + + + Turkmen + tk + + + Turkish + tr + + + Twi + tw + + + Uighur + ug + + + Ukrainian + uk + + + Urdu + ur + + + Uzbek + uz + + + Venda + ve + + + Vietnamese + vi + + + Volapük + vo + + + Walloon + wa + + + Wolof + wo + + + Xhosa + xh + + + Yiddish + yi + + + Yoruba + yo + + + Zhuang + za + + + Chinese + zh + + + Zulu + zu + + + (Other) + other + + + + + Accelerated Breeding + Accelerated Breeding + + + AgriLAC Resiliente + AgriLAC Resiliente + + + Agroecology + Agroecology + + + Aquatic Foods + Aquatic Foods + + + Asian Mega-Deltas + Asian Mega-Deltas + + + Breeding Resources + Breeding Resources + + + Climate Resilience + Climate Resilience + + + Digital Innovation + Digital Innovation 
+ + + Diversification in East and Southern Africa + Diversification in East and Southern Africa + + + Excellence in Agronomy + Excellence in Agronomy + + + Foresight + Foresight + + + Fragility, Conflict, and Migration + Fragility, Conflict, and Migration + + + Fragility to Resilience in Central and West Asia and North Africa + Fragility to Resilience in Central and West Asia and North Africa + + + Fruit and Vegetables for Sustainable Healthy Diets + Fruit and Vegetables for Sustainable Healthy Diets + + + Gender Equality + Gender Equality + + + Genebanks + Genebanks + + + Livestock and Climate + Livestock and Climate + + + Low-Emission Food Systems + Low-Emission Food Systems + + + Market Intelligence + Market Intelligence + + + Mixed Farming Systems + Mixed Farming Systems + + + NEXUS Gains + NEXUS Gains + + + National Policies and Strategies + National Policies and Strategies + + + Nature-Positive Solutions + Nature-Positive Solutions + + + One Health + One Health + + + Plant Health + Plant Health + + + Resilient Cities + Resilient Cities + + + Rethinking Food Markets + Rethinking Food Markets + + + Seed Equal + Seed Equal + + + Sustainable Animal Productivity + Sustainable Animal Productivity + + + Sustainable Healthy Diets + Sustainable Healthy Diets + + + Transforming Agrifood Systems in South Asia + Transforming Agrifood Systems in South Asia + + + West and Central African Food Systems Transformation + West and Central African Food Systems Transformation + + + + + Agriculture for Nutrition and Health + Agriculture for Nutrition and Health + + + Big Data + Big Data + + + Climate Change, Agriculture and Food Security + Climate Change, Agriculture and Food Security + + + Excellence in Breeding + Excellence in Breeding + + + Fish + Fish + + + Forests, Trees and Agroforestry + Forests, Trees and Agroforestry + + + Gender + Gender + + + Genebanks + Genebanks + + + Grain Legumes and Dryland Cereals + Grain Legumes and Dryland Cereals + + + Livestock + Livestock + + 
+ Maize + Maize + + + Policies, Institutions, and Markets + Policies, Institutions, and Markets + + + Rice + Rice + + + Roots, Tubers and Bananas + Roots, Tubers and Bananas + + + Water, Land and Ecosystems + Water, Land and Ecosystems + + + Wheat + Wheat + + + Aquatic Agricultural Systems + Aquatic Agricultural Systems + + + Dryland Cereals + Dryland Cereals + + + Dryland Systems + Dryland Systems + + + Grain Legumes + Grain Legumes + + + Integrated Systems for the Humid Tropics + Integrated Systems for the Humid Tropics + + + Livestock and Fish + Livestock and Fish + + + + + SDG 1 - No poverty + SDG 1 - No poverty + + + SDG 2 - Zero hunger + SDG 2 - Zero hunger + + + SDG 3 - Good health and well-being + SDG 3 - Good health and well-being + + + SDG 4 - Quality education + SDG 4 - Quality education + + + SDG 5 - Gender equality + SDG 5 - Gender equality + + + SDG 6 - Clean water and sanitation + SDG 6 - Clean water and sanitation + + + SDG 7 - Affordable and clean energy + SDG 7 - Affordable and clean energy + + + SDG 8 - Decent work and economic growth + SDG 8 - Decent work and economic growth + + + SDG 9 - Industry, innovation and infrastructure + SDG 9 - Industry, innovation and infrastructure + + + SDG 10 - Reduced inequalities + SDG 10 - Reduced inequalities + + + SDG 11 - Sustainable cities and communities + SDG 11 - Sustainable cities and communities + + + SDG 12 - Responsible consumption and production + SDG 12 - Responsible consumption and production + + + SDG 13 - Climate action + SDG 13 - Climate action + + + SDG 14 - Life below water + SDG 14 - Life below water + + + SDG 15 - Life on land + SDG 15 - Life on land + + + SDG 16 - Peace, justice and strong institutions + SDG 16 - Peace, justice and strong institutions + + + SDG 17 - Partnerships for the goals + SDG 17 - Partnerships for the goals + + + + + Climate Change + Climate Change + + + Environmental Health and Biodiversity + Environmental Health and Biodiversity + + + Gender + Gender + + + 
Nutrition, Health and Food Security + Nutrition, Health and Food Security + + + Poverty Reduction, Livelihoods and Jobs + Poverty Reduction, Livelihoods and Jobs + + + + + Climate adaptation and mitigation + Climate adaptation and mitigation + + + Environmental health and biodiversity + Environmental health and biodiversity + + + Gender equality, youth and social inclusion + Gender equality, youth and social inclusion + + + Nutrition, health and food security + Nutrition, health and food security + + + Poverty reduction, livelihoods and jobs + Poverty reduction, livelihoods and jobs + + + + + Genetic Innovation + Genetic Innovation + + + Resilient Agrifood Systems + Resilient Agrifood Systems + + + Systems Transformation + Systems Transformation + + + + + A Plus + A Plus + + + A + A + + + B + B + + + C + C + + + Not ranked + Not ranked + + + + + ACP + ACP + + + Africa + Africa + + + Americas + Americas + + + Asia + Asia + + + Australia and New Zealand + Australia and New Zealand + + + Caribbean + Caribbean + + + Central America + Central America + + + Central Asia + Central Asia + + + Channel Islands + Channel Islands + + + Eastern Africa + Eastern Africa + + + Eastern Asia + Eastern Asia + + + Eastern Europe + Eastern Europe + + + Europe + Europe + + + Latin America + Latin America + + + Latin America and the Caribbean + Latin America and the Caribbean + + + Melanesia + Melanesia + + + Micronesia + Micronesia + + + Middle Africa + Middle Africa + + + Middle East + Middle East + + + Northern Africa + Northern Africa + + + Northern America + Northern America + + + Northern Europe + Northern Europe + + + Oceania + Oceania + + + Polynesia + Polynesia + + + Sahel + Sahel + + + South America + South America + + + South-eastern Asia + South-eastern Asia + + + Southern Africa + Southern Africa + + + Southern Asia + Southern Asia + + + Southern Europe + Southern Europe + + + Sub-Saharan Africa + Sub-Saharan Africa + + + West and Central Africa + West and Central Africa + + 
+ Western Africa + Western Africa + + + Western Asia + Western Asia + + + Western Europe + Western Europe + + + + + Afghanistan + Afghanistan + + + Albania + Albania + + + Algeria + Algeria + + + American Samoa + American Samoa + + + Andorra + Andorra + + + Angola + Angola + + + Anguilla + Anguilla + + + Antarctica + Antarctica + + + Antigua and Barbuda + Antigua and Barbuda + + + Argentina + Argentina + + + Armenia + Armenia + + + Aruba + Aruba + + + Australia + Australia + + + Austria + Austria + + + Azerbaijan + Azerbaijan + + + Bahamas + Bahamas + + + Bahrain + Bahrain + + + Bangladesh + Bangladesh + + + Barbados + Barbados + + + Belarus + Belarus + + + Belgium + Belgium + + + Belize + Belize + + + Benin + Benin + + + Bermuda + Bermuda + + + Bhutan + Bhutan + + + Bolivia + Bolivia + + + Bosnia and Herzegovina + Bosnia and Herzegovina + + + Botswana + Botswana + + + Bouvet Island + Bouvet Island + + + Brazil + Brazil + + + Brunei Darussalam + Brunei Darussalam + + + Bulgaria + Bulgaria + + + Burkina Faso + Burkina Faso + + + Burundi + Burundi + + + Cambodia + Cambodia + + + Cameroon + Cameroon + + + Canada + Canada + + + Cabo Verde + Cabo Verde + + + Central African Republic + Central African Republic + + + Chad + Chad + + + Chile + Chile + + + China + China + + + Cocos (Keeling) Islands + Cocos (Keeling) Islands + + + Colombia + Colombia + + + Comoros + Comoros + + + Congo + Congo + + + Congo, Democratic Republic of + Congo, Democratic Republic of + + + Cook Islands + Cook Islands + + + Costa Rica + Costa Rica + + + Côte d'Ivoire + Côte d'Ivoire + + + Croatia + Croatia + + + Cuba + Cuba + + + Cyprus + Cyprus + + + Czech Republic + Czech Republic + + + Denmark + Denmark + + + Djibouti + Djibouti + + + Dominica + Dominica + + + Dominican Republic + Dominican Republic + + + Ecuador + Ecuador + + + Egypt + Egypt + + + El Salvador + El Salvador + + + Equatorial Guinea + Equatorial Guinea + + + Eritrea + Eritrea + + + Estonia + Estonia + + + Ethiopia + Ethiopia + + + 
Fiji + Fiji + + + Finland + Finland + + + France + France + + + Gabon + Gabon + + + Gambia + Gambia + + + Georgia + Georgia + + + Germany + Germany + + + Ghana + Ghana + + + Greece + Greece + + + Grenada + Grenada + + + Guadeloupe + Guadeloupe + + + Guatemala + Guatemala + + + Guinea + Guinea + + + Guinea-Bissau + Guinea-Bissau + + + Guyana + Guyana + + + Haiti + Haiti + + + Honduras + Honduras + + + Hungary + Hungary + + + Iceland + Iceland + + + India + India + + + Indonesia + Indonesia + + + Iran + Iran + + + Iraq + Iraq + + + Ireland + Ireland + + + Israel + Israel + + + Italy + Italy + + + Jamaica + Jamaica + + + Japan + Japan + + + Jordan + Jordan + + + Kazakhstan + Kazakhstan + + + Kenya + Kenya + + + Kiribati + Kiribati + + + Korea, DPR + Korea, DPR + + + Korea, Republic of + Korea, Republic of + + + Kuwait + Kuwait + + + Kyrgyzstan + Kyrgyzstan + + + Laos + Laos + + + Latvia + Latvia + + + Lebanon + Lebanon + + + Lesotho + Lesotho + + + Liberia + Liberia + + + Libya + Libya + + + Lithuania + Lithuania + + + Luxembourg + Luxembourg + + + North Macedonia + North Macedonia + + + Madagascar + Madagascar + + + Malawi + Malawi + + + Malaysia + Malaysia + + + Maldives + Maldives + + + Mali + Mali + + + Malta + Malta + + + Mauritania + Mauritania + + + Mauritius + Mauritius + + + Marshall Islands + Marshall Islands + + + Mexico + Mexico + + + Moldova + Moldova + + + Mongolia + Mongolia + + + Micronesia (Federated States of) + Micronesia (Federated States of) + + + Montenegro + Montenegro + + + Montserrat + Montserrat + + + Morocco + Morocco + + + Mozambique + Mozambique + + + Myanmar + Myanmar + + + Namibia + Namibia + + + Nepal + Nepal + + + Netherlands + Netherlands + + + New Zealand + New Zealand + + + Nicaragua + Nicaragua + + + Niger + Niger + + + Nigeria + Nigeria + + + Norway + Norway + + + Oman + Oman + + + Pakistan + Pakistan + + + Palestine, State of + Palestine, State of + + + Panama + Panama + + + Papua New Guinea + Papua New Guinea + + + Paraguay + 
Paraguay + + + Peru + Peru + + + Philippines + Philippines + + + Poland + Poland + + + Portugal + Portugal + + + Qatar + Qatar + + + Romania + Romania + + + Russia + Russia + + + Rwanda + Rwanda + + + Saint Kitts and Nevis + Saint Kitts and Nevis + + + Saint Lucia + Saint Lucia + + + Samoa + Samoa + + + Sao Tome and Principe + Sao Tome and Principe + + + Saudi Arabia + Saudi Arabia + + + Senegal + Senegal + + + Serbia + Serbia + + + Seychelles + Seychelles + + + Sierra Leone + Sierra Leone + + + Singapore + Singapore + + + Slovakia + Slovakia + + + Slovenia + Slovenia + + + Solomon Islands + Solomon Islands + + + Somalia + Somalia + + + South Africa + South Africa + + + South Sudan + South Sudan + + + Spain + Spain + + + Sri Lanka + Sri Lanka + + + Sudan + Sudan + + + Suriname + Suriname + + + Eswatini + Eswatini + + + Sweden + Sweden + + + Switzerland + Switzerland + + + Syria + Syria + + + Taiwan + Taiwan + + + Tajikistan + Tajikistan + + + Tanzania + Tanzania + + + Thailand + Thailand + + + Timor-Leste + Timor-Leste + + + Togo + Togo + + + Tokelau + Tokelau + + + Tonga + Tonga + + + Trinidad and Tobago + Trinidad and Tobago + + + Tunisia + Tunisia + + + Türkiye + Türkiye + + + Turkmenistan + Turkmenistan + + + Uganda + Uganda + + + Ukraine + Ukraine + + + United Arab Emirates + United Arab Emirates + + + United Kingdom + United Kingdom + + + United States + United States + + + Uruguay + Uruguay + + + Uzbekistan + Uzbekistan + + + Vanuatu + Vanuatu + + + Vatican City State + Vatican City State + + + Venezuela + Venezuela + + + Vietnam + Vietnam + + + Yemen + Yemen + + + Zambia + Zambia + + + Zimbabwe + Zimbabwe + + + + + AMAZON + AMAZON + + + ANDES + ANDES + + + GANGES + GANGES + + + INDUS + INDUS + + + KAREKH + KAREKH + + + LIMPOPO + LIMPOPO + + + MEKONG + MEKONG + + + NIGER + NIGER + + + NILE + NILE + + + SAO FRANCISCO + SAO FRANCISCO + + + VOLTA + VOLTA + + + YELLOW + YELLOW + + + + + N/A + + + + Internal Review + Internal Review + + + Peer Review + Peer 
Review + + + + + N/A + + + + CGIAR single centre + CGIAR single centre + + + CGIAR multi-centre + CGIAR multi-centre + + + CGIAR and developing country institute + CGIAR and developing country institute + + + CGIAR and advanced research institute + CGIAR and advanced research institute + + + Consultant + Consultant + + + Not CGIAR developing country institute + Not CGIAR developing country institute + + + Not CGIAR international institute + Not CGIAR international institute + + + + + ANDEAN ROOTS AND TUBERS + ANDEAN ROOTS AND TUBERS + + + BIGDATA + BIGDATA + + + BIODIVERSITY FOR THE FUTURE + BIODIVERSITY FOR THE FUTURE + + + BIOFORTIFICATION + BIOFORTIFICATION + + + BREEDING + BREEDING + + + CLIMATE CHANGE + CLIMATE CHANGE + + + CLIMATE-SMART AGRICULTURE + CLIMATE-SMART AGRICULTURE + + + CROP PROTECTION + CROP PROTECTION + + + CROP AND SYSTEMS SCIENCES CSS + CROP AND SYSTEMS SCIENCES CSS + + + CRYOPRESERVATION + CRYOPRESERVATION + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + GENDER + GENDER + + + GENEBANK + GENEBANK + + + GENETIC RESOURCES + GENETIC RESOURCES + + + GENETICS, GENOMICS AND CROP IMPROVEMENT SCIENCES GGCI + GENETICS, GENOMICS AND CROP IMPROVEMENT SCIENCES GGCI + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INCLUSIVE GROWTH + INCLUSIVE GROWTH + + + NUTRITION + NUTRITION + + + NUTRITIONAL SECURITY + NUTRITIONAL SECURITY + + + POTATO AGRI-FOOD SYSTEMS + POTATO AGRI-FOOD SYSTEMS + + + POTATOES + POTATOES + + + SEED SYSTEMS + SEED SYSTEMS + + + SOCIAL AND NUTRITIONAL SCIENCES SNS + SOCIAL AND NUTRITIONAL SCIENCES SNS + + + SWEETPOTATOES + SWEETPOTATOES + + + SWEETPOTATO AGRI-FOOD SYSTEMS + SWEETPOTATO AGRI-FOOD SYSTEMS + + + + + ADVOCACY + ADVOCACY + + + AGRICULTURE + AGRICULTURE + + + AGRI-HEALTH + AGRI-HEALTH + + + AGROFORESTRY + AGROFORESTRY + + + AFLATOXINS + AFLATOXINS + + + AMR + AMR + + + ANIMAL BREEDING + ANIMAL BREEDING + + + ANIMAL CARE + ANIMAL CARE + + + ANIMAL DISEASES + ANIMAL DISEASES + + + ANIMAL FEEDING + 
ANIMAL FEEDING + + + ANIMAL HEALTH + ANIMAL HEALTH + + + ANIMAL PRODUCTION + ANIMAL PRODUCTION + + + ANIMAL PRODUCTS + ANIMAL PRODUCTS + + + ANIMAL WELFARE + ANIMAL WELFARE + + + APICULTURE + APICULTURE + + + ASF + ASF + + + BIODIVERSITY + BIODIVERSITY + + + BIOTECHNOLOGY + BIOTECHNOLOGY + + + BIRD FLU + BIRD FLU + + + BREEDS + BREEDS + + + BRUCELLOSIS + BRUCELLOSIS + + + BUFFALO + BUFFALO + + + BUSHMEAT + BUSHMEAT + + + CAPACITY STRENGTHENING + CAPACITY STRENGTHENING + + + CAMELS + CAMELS + + + CATTLE + CATTLE + + + CBPP + CBPP + + + CHICKENS + CHICKENS + + + CLIMATE CHANGE + CLIMATE CHANGE + + + COMMUNICATIONS + COMMUNICATIONS + + + CONSUMPTION + CONSUMPTION + + + COVID19 + COVID19 + + + CROP RESIDUES + CROP RESIDUES + + + CSF + CSF + + + CROP-LIVESTOCK + CROP-LIVESTOCK + + + CROPS + CROPS + + + DAIRYING + DAIRYING + + + DATA + DATA + + + DIAGNOSTICS + DIAGNOSTICS + + + DIET + DIET + + + DISEASE CONTROL + DISEASE CONTROL + + + DROUGHT + DROUGHT + + + DRYLANDS + DRYLANDS + + + ECF + ECF + + + EMERGING DISEASES + EMERGING DISEASES + + + ENVIRONMENT + ENVIRONMENT + + + EPIDEMIOLOGY + EPIDEMIOLOGY + + + EXTENSION + EXTENSION + + + FARM MANAGEMENT + FARM MANAGEMENT + + + FARMING SYSTEMS + FARMING SYSTEMS + + + FEEDS + FEEDS + + + FISH + FISH + + + FMD + FMD + + + FODDER + FODDER + + + FOOD SAFETY + FOOD SAFETY + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + FORAGES + FORAGES + + + FORESTRY + FORESTRY + + + GENETICS + GENETICS + + + GENETIC RESOURCES + GENETIC RESOURCES + + + GENDER + GENDER + + + GEODATA + GEODATA + + + GHG EMISSIONS + GHG EMISSIONS + + + GOATS + GOATS + + + HUMAN HEALTH + HUMAN HEALTH + + + HIV-AIDS + HIV-AIDS + + + HUMID TROPICS + HUMID TROPICS + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INDIGENOUS BREEDS + INDIGENOUS BREEDS + + + INNOVATION SYSTEMS + INNOVATION SYSTEMS + + + INTENSIFICATION + INTENSIFICATION + + + INSURANCE + INSURANCE + + + IRRIGATION + IRRIGATION + + + KNOWLEDGE AND INFORMATION + KNOWLEDGE AND 
INFORMATION + + + LIVELIHOODS + LIVELIHOODS + + + LEGUMES + LEGUMES + + + LIVESTOCK + LIVESTOCK + + + LIVESTOCK SYSTEMS + LIVESTOCK SYSTEMS + + + LIVESTOCK-WATER + LIVESTOCK-WATER + + + MARKETS + MARKETS + + + MEAT + MEAT + + + MERS + MERS + + + NRM + NRM + + + NUTRITION + NUTRITION + + + ONE HEALTH + ONE HEALTH + + + PARTICIPATION + PARTICIPATION + + + PASTORALISM + PASTORALISM + + + PESTS + PESTS + + + PIGS + PIGS + + + POULTRY + POULTRY + + + POLICY + POLICY + + + PPR + PPR + + + PRO-POOR LIVESTOCK + PRO-POOR LIVESTOCK + + + RANGELANDS + RANGELANDS + + + REPRODUCTION + REPRODUCTION + + + RESEARCH + RESEARCH + + + RESILIENCE + RESILIENCE + + + RVF + RVF + + + SCALING + SCALING + + + SEEDS + SEEDS + + + SHEEP + SHEEP + + + SMALL RUMINANTS + SMALL RUMINANTS + + + SOCIAL LEARNING + SOCIAL LEARNING + + + SOILS + SOILS + + + TRADE + TRADE + + + TRYPANOSOMIASIS + TRYPANOSOMIASIS + + + VACCINES + VACCINES + + + VALUE CHAINS + VALUE CHAINS + + + VULNERABILITY + VULNERABILITY + + + WATER + WATER + + + WILD MEAT + WILD MEAT + + + WILDLIFE + WILDLIFE + + + WILDLIFE CONSERVATION + WILDLIFE CONSERVATION + + + WOMEN + WOMEN + + + ZOONOTIC DISEASES + ZOONOTIC DISEASES + + + + + AGRICULTURE + AGRICULTURE + + + AGROFORESTRY + AGROFORESTRY + + + BANANA + BANANA + + + BEANS + BEANS + + + BIODIVERSITY + BIODIVERSITY + + + BIOFORTIFICATION + BIOFORTIFICATION + + + CACAO + CACAO + + + CASSAVA + CASSAVA + + + CAPACITY DEVELOPMENT + CAPACITY DEVELOPMENT + + + CLIMATE CHANGE + CLIMATE CHANGE + + + CLIMATE CHANGE ADAPTATION + CLIMATE CHANGE ADAPTATION + + + CLIMATE CHANGE MITIGATION + CLIMATE CHANGE MITIGATION + + + COCONUT + COCONUT + + + CONSERVATION AND USE + CONSERVATION AND USE + + + CROP PRODUCTION + CROP PRODUCTION + + + CROP WILD RELATIVES + CROP WILD RELATIVES + + + DOCUMENTATION + DOCUMENTATION + + + ECOSYSTEM SERVICES + ECOSYSTEM SERVICES + + + ECONOMICS + ECONOMICS + + + EXTENSION + EXTENSION + + + FARMING SYSTEMS + FARMING SYSTEMS + + + FOOD SECURITY + FOOD SECURITY + + + 
FOOD SYSTEMS + FOOD SYSTEMS + + + FORESTRY + FORESTRY + + + GENDER AND EQUITY + GENDER AND EQUITY + + + GENETIC RESOURCES + GENETIC RESOURCES + + + GERMPLASM CONSERVATION + GERMPLASM CONSERVATION + + + GOVERNANCE + GOVERNANCE + + + HEALTH + HEALTH + + + HOME GARDENS + HOME GARDENS + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INDIGENOUS KNOWLEDGE + INDIGENOUS KNOWLEDGE + + + INFORMATICS + INFORMATICS + + + INFORMATION SYSTEMS + INFORMATION SYSTEMS + + + KNOWLEDGE MANAGEMENT + KNOWLEDGE MANAGEMENT + + + LAND USE + LAND USE + + + LIVELIHOODS + LIVELIHOODS + + + LIVESTOCK + LIVESTOCK + + + MARKETS + MARKETS + + + MODELING + MODELING + + + MONITORING AND REPORTING + MONITORING AND REPORTING + + + NATURAL RESOURCE MANAGEMENT + NATURAL RESOURCE MANAGEMENT + + + NEGLECTED AND UNDERUTILIZED SPECIES + NEGLECTED AND UNDERUTILIZED SPECIES + + + NUTRITION + NUTRITION + + + PARTICIPATORY RESEARCH + PARTICIPATORY RESEARCH + + + PESTS AND DISEASES + PESTS AND DISEASES + + + PLANT BREEDING + PLANT BREEDING + + + PLANT GENETIC RESOURCES + PLANT GENETIC RESOURCES + + + POLICY + POLICY + + + RESILIENCE + RESILIENCE + + + RESTORATION + RESTORATION + + + RICE + RICE + + + RURAL COMMUNITIES + RURAL COMMUNITIES + + + SEED SYSTEMS + SEED SYSTEMS + + + SMALLHOLDER FARMERS + SMALLHOLDER FARMERS + + + SOIL HEALTH + SOIL HEALTH + + + SOIL INFORMATION + SOIL INFORMATION + + + SOIL LANDSCAPES + SOIL LANDSCAPES + + + STANDARDS + STANDARDS + + + SUSTAINABILITY + SUSTAINABILITY + + + TREE CROPS + TREE CROPS + + + TROPICAL FORAGES + TROPICAL FORAGES + + + VALUE CHAINS + VALUE CHAINS + + + WATER + WATER + + + + + AFLATOXIN + AFLATOXIN + + + AGRIBUSINESS + AGRIBUSINESS + + + AGRONOMY + AGRONOMY + + + BANANA + BANANA + + + BASELINE SURVEY + BASELINE SURVEY + + + BIODIVERSITY + BIODIVERSITY + + + BIOFORTIFICATION + BIOFORTIFICATION + + + BIOMETRICS + BIOMETRICS + + + BIOSCIENCE + BIOSCIENCE + + + CAPACITY DEVELOPMENT + CAPACITY DEVELOPMENT + + + CASSAVA + CASSAVA + + + CLIMATE CHANGE + CLIMATE 
CHANGE + + + COCOA + COCOA + + + COWPEA + COWPEA + + + CROP HUSBANDRY + CROP HUSBANDRY + + + CROP SYSTEMS + CROP SYSTEMS + + + DISEASE CONTROL + DISEASE CONTROL + + + DOMESTIC TRADE + DOMESTIC TRADE + + + FARM MANAGEMENT + FARM MANAGEMENT + + + FARMING SYSTEMS + FARMING SYSTEMS + + + FOOD SCIENCE + FOOD SCIENCE + + + FOOD SECURITY + FOOD SECURITY + + + FOOD SYSTEMS + FOOD SYSTEMS + + + FORESTRY + FORESTRY + + + GENDER + GENDER + + + GENETIC IMPROVEMENT + GENETIC IMPROVEMENT + + + GRAIN LEGUMES + GRAIN LEGUMES + + + HANDLING, TRANSPORT, STORAGE AND PROTECTION OF AGRICULTURAL PRODUCTS + HANDLING, TRANSPORT, STORAGE AND PROTECTION OF AGRICULTURAL PRODUCTS + + + IMPACT ASSESSMENT + IMPACT ASSESSMENT + + + INTEGRATED SOIL FERTILITY MANAGEMENT + INTEGRATED SOIL FERTILITY MANAGEMENT + + + KNOWLEDGE MANAGEMENT + KNOWLEDGE MANAGEMENT + + + LAND USE + LAND USE + + + LIVELIHOODS + LIVELIHOODS + + + MAIZE + MAIZE + + + MARKETS + MARKETS + + + METEOROLOGY AND CLIMATOLOGY + METEOROLOGY AND CLIMATOLOGY + + + NATURAL RESOURCE MANAGEMENT + NATURAL RESOURCE MANAGEMENT + + + NUTRITION + NUTRITION + + + PESTS OF PLANTS + PESTS OF PLANTS + + + PLANT BREEDING + PLANT BREEDING + + + PLANT DISEASES + PLANT DISEASES + + + PLANT ECOLOGY + PLANT ECOLOGY + + + PLANT GENETIC RESOURCES + PLANT GENETIC RESOURCES + + + PLANT HEALTH + PLANT HEALTH + + + PLANT PRODUCTION + PLANT PRODUCTION + + + PLANTAIN + PLANTAIN + + + POLICIES AND INSTITUTIONS + POLICIES AND INSTITUTIONS + + + POST-HARVESTING TECHNOLOGY + POST-HARVESTING TECHNOLOGY + + + RESEARCH METHOD + RESEARCH METHOD + + + SMALLHOLDER FARMERS + SMALLHOLDER FARMERS + + + SOCIOECONOMY + SOCIOECONOMY + + + SOIL FERTILITY + SOIL FERTILITY + + + SOIL HEALTH + SOIL HEALTH + + + SOIL INFORMATION + SOIL INFORMATION + + + SOIL SURVEYS AND MAPPING + SOIL SURVEYS AND MAPPING + + + SOYBEAN + SOYBEAN + + + TISSUE CULTURE + TISSUE CULTURE + + + VALUE CHAINS + VALUE CHAINS + + + WEEDS + WEEDS + + + YAM + YAM + + + + + BIOMETRICS + BIOMETRICS + + + BIOTECH 
& PLANT BREEDING + BIOTECH & PLANT BREEDING + + + NATURAL RESOURCE MANAGEMENT + NATURAL RESOURCE MANAGEMENT + + + NUTRITION & HUMAN HEALTH + NUTRITION & HUMAN HEALTH + + + PLANT PRODUCTION & HEALTH + PLANT PRODUCTION & HEALTH + + + SOCIAL SCIENCE & AGRIBUSINESS + SOCIAL SCIENCE & AGRIBUSINESS + + + + + N/A + + + + Open Access + Open Access + + + Limited Access + Limited Access + + + + + Choose One + + + + Creative Commons Attribution 4.0 (CC BY 4.0) + CC-BY-4.0 + + + Creative Commons Attribution-ShareAlike 4.0 (CC BY-SA 4.0) + CC-BY-SA-4.0 + + + Creative Commons Attribution-NoDerivatives 4.0 (CC BY-ND 4.0) + CC-BY-ND-4.0 + + + Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0) + CC-BY-NC-4.0 + + + Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (CC BY-NC-SA 4.0) + CC-BY-NC-SA-4.0 + + + Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 (CC BY-NC-ND 4.0) + CC-BY-NC-ND-4.0 + + + Creative Commons Attribution 3.0 (CC BY 3.0) + CC-BY-3.0 + + + Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) + CC-BY-SA-3.0 + + + Creative Commons Attribution-NoDerivs 3.0 (CC BY-ND 3.0) + CC-BY-ND-3.0 + + + Creative Commons Attribution-NonCommercial 3.0 (CC BY-NC 3.0) + CC-BY-NC-3.0 + + + Creative Commons Attribution-NonCommercial-ShareAlike 3.0 (CC BY-NC-SA 3.0) + CC-BY-NC-SA-3.0 + + + Creative Commons Attribution-NonCommercial-NoDerivs 3.0 (CC BY-NC-ND 3.0) + CC-BY-NC-ND-3.0 + + + Creative Commons Attribution 3.0 IGO (CC BY 3.0 IGO) + CC-BY-3.0-IGO + + + Creative Commons Attribution-ShareAlike 3.0 IGO (CC BY-SA 3.0 IGO) + CC-BY-SA-3.0-IGO + + + Creative Commons Attribution-NoDerivs 3.0 IGO (CC BY-ND 3.0 IGO) + CC-BY-ND-3.0-IGO + + + Creative Commons Attribution-NonCommercial 3.0 IGO (CC BY-NC 3.0 IGO) + CC-BY-NC-3.0-IGO + + + Creative Commons Attribution-NonCommercial-ShareAlike 3.0 IGO (CC BY-NC-SA 3.0 IGO) + CC-BY-NC-SA-3.0-IGO + + + Creative Commons Attribution-NonCommercial-NoDerivs 3.0 IGO (CC BY-NC-ND 3.0 IGO) + CC-BY-NC-ND-3.0-IGO + + + +
Creative Commons Zero Public Domain Dedication 1.0 (CC0 1.0) + CC0-1.0 + + + Creative Commons Attribution No Version (CC BY) + CC-BY + + + Creative Commons Attribution-ShareAlike No Version (CC BY-SA) + CC-BY-SA + + + Creative Commons Attribution-NoDerivatives No Version (CC BY-ND) + CC-BY-ND + + + Creative Commons Attribution-NonCommercial No Version (CC BY-NC) + CC-BY-NC + + + Creative Commons Attribution-NonCommercial-ShareAlike No Version (CC BY-NC-SA) + CC-BY-NC-SA + + + Creative Commons Attribution-NonCommercial-NoDerivatives No Version (CC BY-NC-ND) + CC-BY-NC-ND + + + Open Government Licence v3.0 (OGL-UK-3.0) + OGL-UK-3.0 + + + GNU General Public License v3.0 (GPL-3.0-only) + GPL-3.0-only + + + MIT License + MIT + + + Copyrighted; all rights reserved + Copyrighted; all rights reserved + + + Copyrighted; Non-commercial educational use only + Copyrighted; Non-commercial educational use only + + + Copyrighted; Non-commercial use only + Copyrighted; Non-commercial use only + + + All rights reserved; self-archive copy only + All rights reserved; self-archive copy only + + + All rights reserved; no re-use allowed + All rights reserved; no re-use allowed + + + Other + Other + + + + + Non-ISI Journal + + + + ISI Journal + ISI Journal + + + + + N/A + + + + Grey Literature + Grey Literature + + + Formally Published + Formally Published + + + + + Academics + Academics + + + CGIAR + CGIAR + + + Development Practitioners + Development Practitioners + + + Donors + Donors + + + Extension + Extension + + + Farmers + Farmers + + + General Public + General Public + + + NGOs + NGOs + + + Policy Makers + Policy Makers + + + Scientists + Scientists + + + diff --git a/dspace/config/workflow-curation.xml b/dspace/config/workflow-curation.xml index 89cfd5309f2a..24cdb98d32d4 100644 --- a/dspace/config/workflow-curation.xml +++ b/dspace/config/workflow-curation.xml @@ -9,7 +9,7 @@ - + @@ -62,7 +62,20 @@ - + + + + + + + + + + + + + + diff --git a/dspace/modules/additions/pom.xml 
b/dspace/modules/additions/pom.xml index 7e60e982ec45..71a918cf1a70 100644 --- a/dspace/modules/additions/pom.xml +++ b/dspace/modules/additions/pom.xml @@ -286,6 +286,11 @@ mockito-inline test
+ + io.github.ilri.cgspace + cgspace-java-helpers + 7.6.1.2-SNAPSHOT + diff --git a/dspace/solr/authority/conf/solrconfig.xml b/dspace/solr/authority/conf/solrconfig.xml index 21f917ebf8ca..10d503a260b9 100644 --- a/dspace/solr/authority/conf/solrconfig.xml +++ b/dspace/solr/authority/conf/solrconfig.xml @@ -50,7 +50,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/dspace/solr/oai/conf/solrconfig.xml b/dspace/solr/oai/conf/solrconfig.xml index ce8d9ebe2060..30d1cd055861 100644 --- a/dspace/solr/oai/conf/solrconfig.xml +++ b/dspace/solr/oai/conf/solrconfig.xml @@ -59,7 +59,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/dspace/solr/search/conf/solrconfig.xml b/dspace/solr/search/conf/solrconfig.xml index 97b1d1ddbbf6..2a200dc9c172 100644 --- a/dspace/solr/search/conf/solrconfig.xml +++ b/dspace/solr/search/conf/solrconfig.xml @@ -70,7 +70,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/dspace/solr/statistics/conf/solrconfig.xml b/dspace/solr/statistics/conf/solrconfig.xml index 2b1cff45373d..abfb7d7e17b2 100644 --- a/dspace/solr/statistics/conf/solrconfig.xml +++ b/dspace/solr/statistics/conf/solrconfig.xml @@ -59,7 +59,7 @@ - ${solr.max.booleanClauses:1024} + ${solr.max.booleanClauses:2048} diff --git a/ilri/add_dc_rights.py b/ilri/add_dc_rights.py new file mode 100755 index 000000000000..7ba1cd0f8667 --- /dev/null +++ b/ilri/add_dc_rights.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# +# add-dc-rights.py 1.1.2 +# +# Copyright Alan Orth. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see https://www.gnu.org/licenses/.
#
# ---
#
# Add usage rights (dc.rights) to items from CSV.
#
# This script searches for items by handle and adds a dc.rights field to each
# (assuming one does not exist). The format of the CSV file should be:
#
# dc.rights,handle
# CC-BY-NC-ND,10568/72643
# CC-BY-NC-ND,10568/72644
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (for example, in a virtual environment):
#
# $ pip install colorama psycopg2-binary
#

import argparse
import csv
import signal
import sys

import psycopg2
from colorama import Fore

# resource_type_id 2 is for item metadata, metadata_field_id 53 is dc.rights
ITEM_RESOURCE_TYPE_ID = 2
DC_RIGHTS_FIELD_ID = 53


def _add_rights(cursor, handle, rights, *, dry_run, debug):
    """Add a dc.rights value to the item behind *handle* unless it has one.

    Args:
        cursor: open database cursor; the caller owns the transaction.
        handle: item handle in "10568/4" format, as read from the CSV.
        rights: text to store in the new dc.rights metadata value.
        dry_run: when true, only print what would be added.
        debug: callable ``debug(message, color)`` used for diagnostics.
    """
    # get resource_id for current handle
    # remember that tuples with one item need a comma after them!
    cursor.execute("SELECT resource_id FROM handle WHERE handle=%s", (handle,))

    # no resource_id with this handle exists
    if cursor.rowcount == 0:
        debug(
            f"Did not find item with handle {handle}, skipping.\n",
            Fore.YELLOW,
        )
        return

    # multiple resource_id with this handle exist (not expected, but better to check)
    if cursor.rowcount > 1:
        debug(
            f"Found multiple items with handle {handle}, skipping.\n",
            Fore.YELLOW,
        )
        return

    # the single result row is a one-element sequence holding the resource_id
    resource_id = cursor.fetchone()[0]

    # resource_id has been seen to be NULL in a test environment
    if resource_id is None:
        debug(
            f"Item with handle {handle} does not have a resource_id, skipping.\n",
            Fore.YELLOW,
        )
        return

    # check if this item already has dc.rights metadata
    sql = "SELECT text_value FROM metadatavalue WHERE resource_type_id=2 AND resource_id=%s AND metadata_field_id=53"
    cursor.execute(sql, (resource_id,))

    # if rowcount is greater than 0 there must be existing rights for this item
    if cursor.rowcount > 0:
        debug(
            f"Found existing rights metadata for item with handle {handle}, skipping.\n",
            Fore.YELLOW,
        )
        return

    if dry_run:
        print(
            Fore.GREEN
            + f'Would add rights "{rights}" to item with handle {handle}.\n'
            + Fore.RESET
        )
        return

    debug(f'Adding rights "{rights}" to item with handle {handle}.\n', Fore.GREEN)

    # metadatavalue IDs come from a PostgreSQL sequence that increments when you call it
    cursor.execute("SELECT nextval('metadatavalue_seq')")
    metadata_value_id = cursor.fetchone()[0]

    sql = "INSERT INTO metadatavalue (metadata_value_id, resource_id, metadata_field_id, text_value, place, confidence, resource_type_id) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    cursor.execute(
        sql,
        (
            metadata_value_id,
            resource_id,
            DC_RIGHTS_FIELD_ID,
            rights,
            1,
            -1,
            ITEM_RESOURCE_TYPE_ID,
        ),
    )


def main():
    """Read handle/rights pairs from a CSV file and add missing dc.rights values.

    Command line arguments select the CSV file, the database credentials, the
    column names, and the --debug / --dry-run switches. Exits with status 1
    when the database connection fails or a required CSV column is missing.
    """
    # parse the command line arguments
    parser = argparse.ArgumentParser(description="Add usage rights to items from CSV.")
    parser.add_argument(
        "-i",
        "--csv-file",
        help="CSV file containing item handles and rights.",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument("-db", "--database-name", help="Database name", required=True)
    parser.add_argument(
        "-u", "--database-user", help="Database username", required=True
    )
    parser.add_argument(
        "-p", "--database-pass", help="Database password", required=True
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages to standard error (stderr).",
        action="store_true",
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        help="Only print changes that would be made.",
        action="store_true",
    )
    parser.add_argument(
        "-hf",
        "--handle-field-name",
        help='Name of column with handles in "10568/4" format (no URL).',
        default="handle",
    )
    parser.add_argument(
        "-rf",
        "--rights-field-name",
        help="Name of column with usage rights.",
        default="dc.rights",
    )
    args = parser.parse_args()

    def debug(message, color=Fore.GREEN):
        """Write a colored message to stderr, but only when --debug was given."""
        if args.debug:
            sys.stderr.write(color + message + Fore.RESET)

    # set the signal handler for SIGINT (^C) so we can exit cleanly
    signal.signal(signal.SIGINT, signal_handler)

    # connect to database; keyword arguments (rather than a hand-built DSN
    # string) keep passwords with spaces or quotes from corrupting the DSN
    try:
        conn = psycopg2.connect(
            dbname=args.database_name,
            user=args.database_user,
            password=args.database_pass,
            host="localhost",
        )
    except psycopg2.OperationalError:
        sys.stderr.write(Fore.RED + "Unable to connect to the database.\n" + Fore.RESET)

        # close the input CSV file before we exit
        args.csv_file.close()

        sys.exit(1)

    debug("Connected to the database.\n")

    try:
        # open the CSV
        reader = csv.DictReader(args.csv_file)

        # fieldnames is None for an empty file; in that case there is nothing
        # to process and the loop below simply does not run
        if reader.fieldnames is not None:
            missing = [
                name
                for name in (args.handle_field_name, args.rights_field_name)
                if name not in reader.fieldnames
            ]
            if missing:
                missing_names = ", ".join(missing)
                sys.stderr.write(
                    Fore.RED
                    + f"CSV file is missing required column(s): {missing_names}\n"
                    + Fore.RESET
                )
                sys.exit(1)

        # iterate over rows in the CSV
        for row in reader:
            handle = row[args.handle_field_name]
            rights = row[args.rights_field_name]

            debug(f"Finding item with handle {handle}\n")

            # "with conn" wraps each row in its own transaction (commit on
            # success, rollback on error); the cursor is closed when its
            # block exits
            with conn:
                with conn.cursor() as cursor:
                    _add_rights(
                        cursor, handle, rights, dry_run=args.dry_run, debug=debug
                    )
    finally:
        # runs on normal completion, on errors and on SIGINT (sys.exit)
        debug("Disconnecting from database.\n")

        # close the database connection before leaving
        conn.close()

        # close the input CSV file before we exit
        args.csv_file.close()


def signal_handler(signal, frame):
    """Exit with status 1 when SIGINT (^C) is received."""
    sys.exit(1)


if __name__ == "__main__":
    main()
+ +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Add ORCID identifiers to items for a given author name from CSV. +# +# We had previously migrated the ORCID identifiers from CGSpace's authority Solr +# core to cg.creator.identifier fields in matching items, but now we want to add +# them to other matching items in a more arbitrary fashion. Items that are older +# or were uploaded in batch did not have matching authors in the authority +# core, so they did not benefit from that migration, for example. +# +# This script searches for items by author name and adds a cg.creator.identifier +# field to each (assuming one does not exist). The format of the CSV file should +# be: +# +# dc.contributor.author,cg.creator.identifier +# "Orth, Alan",Alan S. Orth: 0000-0002-1735-7458 +# "Orth, A.",Alan S. Orth: 0000-0002-1735-7458 +# +# The order of authors in dc.contributor.author is respected and mirrored in the +# new cg.creator.identifier fields. +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install colorama +# + +import argparse +import csv +import logging +import re +import signal +import sys + +import util +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) + + +def main(): + # parse the command line arguments + parser = argparse.ArgumentParser( + description="Add ORCID identifiers to items for a given author name from CSV. Respects the author order from the dc.contributor.author field."
+ ) + parser.add_argument( + "--author-field-name", + "-f", + help="Name of column with author names.", + default="dc.contributor.author", + ) + parser.add_argument( + "--csv-file", + "-i", + help="CSV file containing author names and ORCID identifiers.", + required=True, + type=argparse.FileType("r", encoding="UTF-8"), + ) + parser.add_argument("--database-name", "-db", help="Database name", required=True) + parser.add_argument( + "--database-user", "-u", help="Database username", required=True + ) + parser.add_argument( + "--database-pass", "-p", help="Database password", required=True + ) + parser.add_argument( + "--debug", + "-d", + help="Print debug messages to standard error (stderr).", + action="store_true", + ) + parser.add_argument( + "--dry-run", + "-n", + help="Only print changes that would be made.", + action="store_true", + ) + parser.add_argument( + "--orcid-field-name", + "-o", + help='Name of column with creators in "Name: 0000-0000-0000-0000" format.', + default="cg.creator.identifier", + ) + args = parser.parse_args() + + # The default log level is WARNING, but we want to set it to DEBUG or INFO + if args.debug: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + # Set the global log format + logging.basicConfig(format="[%(levelname)s] %(message)s") + + # set the signal handler for SIGINT (^C) so we can exit cleanly + signal.signal(signal.SIGINT, signal_handler) + + # connect to database + conn = util.db_connect( + args.database_name, args.database_user, args.database_pass, "localhost" + ) + + cursor = conn.cursor() + + # open the CSV + reader = csv.DictReader(args.csv_file) + + # iterate over rows in the CSV + for row in reader: + author_name = row[args.author_field_name] + + logger.debug( + Fore.GREEN + f"Finding items with author name: {author_name}" + Fore.RESET + ) + + # find all item metadata records with this author name + # metadata_field_id 3 is author + sql = "SELECT dspace_object_id, place FROM metadatavalue 
WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=3 AND text_value=%s" + # remember that tuples with one item need a comma after them! + cursor.execute(sql, (author_name,)) + records_with_author_name = cursor.fetchall() + + if len(records_with_author_name) > 0: + logger.debug( + Fore.GREEN + + f"> Found {len(records_with_author_name)} items." + + Fore.RESET + ) + + # extract cg.creator.identifier text to add from CSV and strip leading/trailing whitespace + text_value = row[args.orcid_field_name].strip() + # extract the ORCID identifier from the cg.creator.identifier text field in the CSV + orcid_identifier_pattern = re.compile( + r"[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}" + ) + orcid_identifier_match = orcid_identifier_pattern.search(text_value) + + # sanity check to make sure we extracted the ORCID identifier from the cg.creator.identifier text in the CSV + if orcid_identifier_match is None: + logger.debug( + Fore.YELLOW + + f'Skipping invalid ORCID identifier in "{text_value}".' 
+ + Fore.RESET + ) + continue + + # we only expect one ORCID identifier, so if it matches it will be group "0" + # see: https://docs.python.org/3/library/re.html + orcid_identifier = orcid_identifier_match.group(0) + + # iterate over results for current author name to add cg.creator.identifier metadata + for record in records_with_author_name: + dspace_object_id = record[0] + # "place" is the order of a metadata value so we can add the cg.creator.identifier metadata matching the author order + place = record[1] + confidence = -1 + + # get the metadata_field_id for the cg.creator.identifier field + metadata_field_id = util.field_name_to_field_id( + cursor, "cg.creator.identifier" + ) + + # check if there is an existing cg.creator.identifier with this author's ORCID identifier for this item (without restricting the "place") + # note that the SQL here is quoted differently to allow us to use LIKE with % wildcards with our paremeter subsitution + sql = "SELECT * from metadatavalue WHERE dspace_object_id=%s AND metadata_field_id=%s AND text_value LIKE '%%' || %s || '%%' AND confidence=%s AND dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn)" + + cursor.execute( + sql, + ( + dspace_object_id, + metadata_field_id, + orcid_identifier, + confidence, + ), + ) + records_with_orcid_identifier = cursor.fetchall() + + if len(records_with_orcid_identifier) == 0: + if args.dry_run: + logger.info( + Fore.YELLOW + + f'(DRY RUN) Adding ORCID identifier "{text_value}" to item {dspace_object_id}' + + Fore.RESET + ) + + continue + + logger.info( + Fore.YELLOW + + f'Adding ORCID identifier "{text_value}" to item {dspace_object_id}' + + Fore.RESET + ) + + # metadatavalue IDs come from a PostgreSQL sequence that increments when you call it + cursor.execute("SELECT nextval('metadatavalue_seq')") + metadata_value_id = cursor.fetchone()[0] + + sql = "INSERT INTO metadatavalue (metadata_value_id, dspace_object_id, metadata_field_id, text_value, place, confidence) 
VALUES (%s, %s, %s, %s, %s, %s)" + cursor.execute( + sql, + ( + metadata_value_id, + dspace_object_id, + metadata_field_id, + text_value, + place, + confidence, + ), + ) + + # Update the last_modified date for each item + util.update_item_last_modified(cursor, dspace_object_id) + else: + logger.debug( + Fore.GREEN + + f"Item {dspace_object_id} already has an ORCID identifier for {text_value}." + + Fore.RESET + ) + + logger.debug("Disconnecting from database.") + + # commit the changes + if not args.dry_run: + conn.commit() + + # close the database connection before leaving + conn.close() + + # close output file before we exit + args.csv_file.close() + + +def signal_handler(signal, frame): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/ilri/agrovoc_lookup.py b/ilri/agrovoc_lookup.py new file mode 100755 index 000000000000..ea267d6d6e9c --- /dev/null +++ b/ilri/agrovoc_lookup.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +# +# agrovoc-lookup.py 0.4.2 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public AGROVOC REST API for subjects read from a text file. Text +# file should have one subject per line. Results are saved to a CSV including +# the subject, the language, the match type, and the total number of matches. 
+# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import csv +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +from colorama import Fore + + +# read subjects from a text file, one per line +def read_subjects_from_file(): + # initialize an empty list for subjects + subjects = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add subjects that aren't already present + if line not in subjects: + subjects.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_subjects(subjects) + + +def resolve_subjects(subjects): + fieldnames = ["subject", "language", "match type", "number of matches"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + # enable transparent request cache with thirty days expiry, as AGROVOC only + # makes new releases monthly so this should be safe. 
+ expire_after = timedelta(days=30) + requests_cache.install_cache("requests-cache", expire_after=expire_after) + + # prune old cache entries + requests_cache.delete() + + for subject in subjects: + if args.debug: + sys.stderr.write( + Fore.GREEN + + f"Looking up the subject: {subject} ({'any' or args.language})\n" + + Fore.RESET + ) + + request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" + request_params = {"query": subject} + + if args.language: + # use user specified language + request_params.update(lang=args.language) + + request = requests.get(request_url, params=request_params) + + if request.status_code == requests.codes.ok: + data = request.json() + + # Assume no match + matched = False + + number_of_matches = len(data["results"]) + + # no results means no match + if number_of_matches == 0: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + f"No match for {subject!r} in AGROVOC (cached: {request.from_cache})\n" + + Fore.RESET + ) + + writer.writerow( + { + "subject": subject, + "language": "", + "match type": "", + "number of matches": number_of_matches, + } + ) + elif number_of_matches >= 1: + for result in request.json()["results"]: + # if there is more than one result we need to check each for + # a preferred or matchedPreLabel match first. If there are + # none then we can check each result again for an altLabel + # matches.alternate match. Note that we need to make sure + # they actually exist before attempting to reference them. + # If they don't exist then we'll catch the exception and set + # the values to None. + # + # Note that matchedPrefLabel is not a property in the SKOS/ + # SKOSXL vocabulary. It seems to be a hint returned by the + # SKOSMOS server to indicate that the search term matched + # the prefLabel of some language. 
+ try: + result["prefLabel"] + except KeyError: + result["prefLabel"] = None + + try: + result["matchedPrefLabel"] + except KeyError: + result["matchedPrefLabel"] = None + + # upper case our subject and the AGROVOC result to make sure + # we're comparing the same thing because AGROVOC returns the + # title case like "Iran" no matter whether you search for + # "IRAN" or "iran". + if ( + result["prefLabel"] + and subject.upper() == result["prefLabel"].upper() + ): + matched = True + language = result["lang"] + print( + f"Match for {subject!r} in AGROVOC {language} (cached: {request.from_cache})" + ) + + writer.writerow( + { + "subject": subject, + "language": language, + "match type": "prefLabel", + "number of matches": number_of_matches, + } + ) + + break + elif ( + result["matchedPrefLabel"] + and subject.upper() == result["matchedPrefLabel"].upper() + ): + matched = True + language = result["lang"] + print( + f"Match for {subject!r} in AGROVOC {language} (cached: {request.from_cache})" + ) + + writer.writerow( + { + "subject": subject, + "language": language, + "match type": "prefLabel", + "number of matches": number_of_matches, + } + ) + + break + + # If we're here we assume there were no matches for prefLabel or + # matchedPrefLabel in the results, so now we will check for an + # altLabel match. 
+ if not matched: + for result in request.json()["results"]: + # make sure key exists before trying to access it + try: + result["altLabel"] + except KeyError: + result["altLabel"] = None + + if ( + result["altLabel"] + and subject.upper() == result["altLabel"].upper() + ): + matched = True + language = result["lang"] + print( + f"Match for {subject!r} in AGROVOC {language} (cached: {request.from_cache})" + ) + + writer.writerow( + { + "subject": subject, + "language": language, + "match type": "altLabel", + "number of matches": number_of_matches, + } + ) + + break + + # close output files before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output files before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the AGROVOC REST API to validate subject terms from a text file and save results in a CSV." +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing subject terms to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-l", "--language", help="Language to query terms (example en, default any)." +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write results to (CSV).", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the addresses from there +if args.input_file: + read_subjects_from_file() + +exit() diff --git a/ilri/bing-networks-to-ips.sh b/ilri/bing-networks-to-ips.sh new file mode 100755 index 000000000000..e98ec716939e --- /dev/null +++ b/ilri/bing-networks-to-ips.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# +# Latest as of 2022-07-06. 
For printing the IPs in each CIDR network so that I +# can purge them all from Solr statistics using check-spider-ip-hits.sh. + +BINGBOT_JSON_URL=https://www.bing.com/toolbox/bingbot.json +# Extract the networks from the JSON (I wrote this using https://jqplay.org/) +BINGBOT_NETWORKS=$(http "$BINGBOT_JSON_URL" \ + | jq --raw-output '.["prefixes"][].ipv4Prefix') + +for network in $BINGBOT_NETWORKS; do + # Use prips to print IPs in given CIDR and strip network and broadcast. + # See: https://stackoverflow.com/a/52501093/1996540 + prips "$network" | sed -e '1d; $d' +done diff --git a/ilri/check-spider-hits.sh b/ilri/check-spider-hits.sh new file mode 100755 index 000000000000..9d6cc2bab2f9 --- /dev/null +++ b/ilri/check-spider-hits.sh @@ -0,0 +1,237 @@ +#!/usr/bin/env bash +# +# check-spider-hits.sh v1.2.0 +# +# Copyright (C) 2019-2020 Alan Orth +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+ +# Exit on first error +set -o errexit + +# defaults +readonly DEF_SPIDERS_PATTERN_FILE=/dspace/config/spiders/agents/example +readonly DEF_SOLR_URL=http://localhost:8081/solr +readonly DEF_STATISTICS_SHARD=statistics + +###### + +readonly PROGNAME=$(basename $0) +readonly ARGS="$@" + +function usage() { + cat <<-EOF +Usage: $PROGNAME [-d] [-f $DEF_SPIDERS_PATTERN_FILE] [-p] [-s $DEF_STATISTICS_SHARD] [-u $DEF_SOLR_URL] + +Optional arguments: + -d: print debug messages + -f: path to file containing spider user agent patterns¹ (default: $DEF_SPIDERS_PATTERN_FILE) + -p: purge statistics that match spider user agents + -s: Solr statistics shard, for example statistics or statistics-2018² (default: $DEF_STATISTICS_SHARD) + -u: URL to Solr (default: $DEF_SOLR_URL) + +Written by: Alan Orth + +¹ DSpace ships an "example" pattern file that works well. Another option is the patterns file maintained by the COUNTER-Robots project. +² If your statistics core has been split into yearly "shards" by DSpace's stats-util you need to search each shard separately. +EOF + + exit 0 +} + +function parse_options() { + while getopts ":df:ps:u:" opt; do + case $opt in + d) + DEBUG=yes + ;; + f) + SPIDERS_PATTERN_FILE=$OPTARG + + if ! [[ -r "$SPIDERS_PATTERN_FILE" ]]; then + echo "(ERROR) Spider patterns file \"$SPIDERS_PATTERN_FILE\" doesn't exist." + + exit 1 + fi + ;; + p) + PURGE_SPIDER_HITS=yes + ;; + s) + STATISTICS_SHARD=$OPTARG + ;; + u) + # make sure -s is passed something like a URL + if ! [[ "$OPTARG" =~ ^https?://.*$ ]]; then + usage + fi + + SOLR_URL=$OPTARG + ;; + \?|:) + usage + ;; + esac + done +} + +function envsetup() { + # check to see if user specified a Solr URL + # ... otherwise use the default + if [[ -z $SOLR_URL ]]; then + SOLR_URL=$DEF_SOLR_URL + fi + + # check to see if user specified a spiders pattern file + # ... 
otherwise use the default + if [[ -z $SPIDERS_PATTERN_FILE ]]; then + SPIDERS_PATTERN_FILE=$DEF_SPIDERS_PATTERN_FILE + fi + + # check to see if user specified Solr statistics shards + # ... otherwise use the default + if [[ -z $STATISTICS_SHARD ]]; then + STATISTICS_SHARD=$DEF_STATISTICS_SHARD + fi +} + +# pass the shell's argument array to the parsing function +parse_options $ARGS + +# set up the defaults +envsetup + +[[ $DEBUG ]] && echo "(DEBUG) Using spiders pattern file: $SPIDERS_PATTERN_FILE" + + +# Make a temporary copy of the spider file so we can do pattern replacement +# inside it with sed rather than using stdout from sed and having to deal +# with spaces and newlines in bash. +SPIDERS_PATTERN_FILE_TEMP=$(mktemp) +cp "$SPIDERS_PATTERN_FILE" "$SPIDERS_PATTERN_FILE_TEMP" + +# Read list of spider user agents from the patterns file, converting PCRE-style +# regular expressions to a format that is easier to deal with in bash (spaces!) +# and that Solr supports (ie, patterns are anchored by ^ and $ implicitly, and +# some character types like \d are not supported). +# +# See: https://1opensourcelover.wordpress.com/2013/09/29/solr-regex-tutorial/ +# +# For now this seems to be enough: +# - Replace \s with a literal space +# - Replace \d with [0-9] character class +# - Unescape dashes +# - Escape @ +# +sed -i -e 's/\\s/ /g' -e 's/\\d/[0-9]/g' -e 's/\\-/-/g' -e 's/@/\\@/g' $SPIDERS_PATTERN_FILE_TEMP + +# Start a tally of bot hits so we can report the total at the end +BOT_HITS=0 + +while read -r spider; do + # Save the original pattern so we can inform the user later + original_spider=$spider + + # Skip patterns that contain a plus or percent sign (+ or %) because they + # are tricky to deal with in Solr. For some reason escaping them seems to + # work for searches, but not for deletes. I don't have time to figure it + # out. 
+ if [[ $spider =~ [%\+] ]]; then + [[ $DEBUG ]] && echo "(DEBUG) Skipping spider: $original_spider" + continue + fi + + + unset has_beginning_anchor + unset has_end_anchor + + # Remove ^ at the beginning because it is implied in Solr's regex search + if [[ $spider =~ ^\^ ]]; then + spider=$(echo $spider | sed -e 's/^\^//') + + # Record that this spider's original user agent pattern had a ^ + has_beginning_anchor=yes + fi + + # Remove $ at the end because it is implied in Solr's regex search + if [[ $spider =~ \$ ]]; then + spider=$(echo $spider | sed -e 's/\$$//') + + # Record that this spider's original user agent pattern had a $ + has_end_anchor=yes + fi + + # If the original pattern did NOT have a beginning anchor (^), then add a + # wildcard at the beginning. + if [[ -z $has_beginning_anchor ]]; then + spider=".*$spider" + fi + + # If the original pattern did NOT have an ending enchor ($), then add a + # wildcard at the end. + if [[ -z $has_end_anchor ]]; then + spider="$spider.*" + fi + + [[ $DEBUG ]] && echo "(DEBUG) Checking for hits from spider: $original_spider" + + # Check for hits from this spider in Solr and save results into a variable, + # setting a custom curl output format so I can get the HTTP status code and + # Solr response in one request, then tease them out later. + solr_result=$(curl -s -w "http_code=%{http_code}" "$SOLR_URL/$STATISTICS_SHARD/select" -d "q=userAgent:/$spider/&rows=0") + + http_code=$(echo $solr_result | grep -o -E 'http_code=[0-9]+' | awk -F= '{print $2}') + + # Check the Solr HTTP response code and skip spider if not successful + if [[ $http_code -ne 200 ]]; then + [[ $DEBUG ]] && echo "(DEBUG) Solr query returned HTTP $http_code, skipping $original_spider." 
+ + continue + fi + + # lazy extraction of Solr numFound (relies on sed -E for extended regex) + numFound=$(echo $solr_result | sed -E 's/\s+http_code=[0-9]+//' | xmllint --format - | grep numFound | sed -E 's/^.*numFound="([0-9]+)".*$/\1/') + + if [[ numFound -gt 0 ]]; then + if [[ $PURGE_SPIDER_HITS ]]; then + echo "Purging $numFound hits from $original_spider in $STATISTICS_SHARD" + + # Purge the hits and soft commit + curl -s "$SOLR_URL/$STATISTICS_SHARD/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "userAgent:/$spider/" > /dev/null 2>&1 + else + echo "Found $numFound hits from $original_spider in $STATISTICS_SHARD" + fi + + BOT_HITS=$((BOT_HITS+numFound)) + fi +done < "$SPIDERS_PATTERN_FILE_TEMP" + +if [[ $BOT_HITS -gt 0 ]]; then + if [[ $PURGE_SPIDER_HITS ]]; then + echo + echo "Total number of bot hits purged: $BOT_HITS" + + # Hard commit after we're done processing all spiders + curl -s "$SOLR_URL/$STATISTICS_SHARD/update?commit=true" > /dev/null 2>&1 + else + echo + echo "Total number of hits from bots: $BOT_HITS" + fi +fi + +if [[ -f "$SPIDERS_PATTERN_FILE_TEMP" ]]; then + rm "$SPIDERS_PATTERN_FILE_TEMP" +fi + +# vim: set expandtab:ts=4:sw=4:bs=2 diff --git a/ilri/check-spider-ip-hits.sh b/ilri/check-spider-ip-hits.sh new file mode 100755 index 000000000000..f990c0c310cc --- /dev/null +++ b/ilri/check-spider-ip-hits.sh @@ -0,0 +1,170 @@ +#!/usr/bin/env bash +# +# check-spider-ip-hits.sh v0.0.2 +# +# Copyright (C) 2020 Alan Orth +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Exit on first error +set -o errexit + +# defaults +readonly DEF_SPIDER_IPS_FILE=/dspace/config/spiders/agents/example +readonly DEF_SOLR_URL=http://localhost:8081/solr +readonly DEF_STATISTICS_SHARD=statistics + +###### + +readonly PROGNAME=$(basename $0) +readonly ARGS="$@" + +function usage() { + cat <<-EOF +Usage: $PROGNAME [-d] [-f $DEF_SPIDER_IPS_FILE] [-p] [-s $DEF_STATISTICS_SHARD] [-u $DEF_SOLR_URL] + +Optional arguments: + -d: print debug messages + -f: path to file containing spider IP addresses (default: $DEF_SPIDER_IPS_FILE) + -p: purge statistics that match spider user agents + -s: Solr statistics shard, for example statistics or statistics-2018¹ (default: $DEF_STATISTICS_SHARD) + -u: URL to Solr (default: $DEF_SOLR_URL) + +Written by: Alan Orth + +¹ If your statistics core has been split into yearly "shards" by DSpace's stats-util you need to search each shard separately. +EOF + + exit 0 +} + +function parse_options() { + while getopts ":df:ps:u:" opt; do + case $opt in + d) + DEBUG=yes + ;; + f) + SPIDER_IPS_FILE=$OPTARG + + if ! [[ -r "$SPIDER_IPS_FILE" ]]; then + echo "(ERROR) Spider IPs file \"$SPIDER_IPS_FILE\" doesn't exist." + + exit 1 + fi + ;; + p) + PURGE_SPIDER_HITS=yes + ;; + s) + STATISTICS_SHARD=$OPTARG + ;; + u) + # make sure -s is passed something like a URL + if ! [[ "$OPTARG" =~ ^https?://.*$ ]]; then + usage + fi + + SOLR_URL=$OPTARG + ;; + \?|:) + usage + ;; + esac + done +} + +function envsetup() { + # check to see if user specified a Solr URL + # ... otherwise use the default + if [[ -z $SOLR_URL ]]; then + SOLR_URL=$DEF_SOLR_URL + fi + + # check to see if user specified a spiders pattern file + # ... otherwise use the default + if [[ -z $SPIDER_IPS_FILE ]]; then + SPIDER_IPS_FILE=$DEF_SPIDER_IPS_FILE + fi + + # check to see if user specified Solr statistics shards + # ... 
otherwise use the default + if [[ -z $STATISTICS_SHARD ]]; then + STATISTICS_SHARD=$DEF_STATISTICS_SHARD + fi +} + +# pass the shell's argument array to the parsing function +parse_options $ARGS + +# set up the defaults +envsetup + +[[ $DEBUG ]] && echo "(DEBUG) Using spider IPs file: $SPIDER_IPS_FILE" + +# Read list of spider IPs, escaping colons in IPv6 address and skipping blank +# lines and comments (#). +IPS=$(sed -e 's/\:/\\:/g' $SPIDER_IPS_FILE | grep -v -E '^$' | grep -v '#') + +# Start a tally of bot hits so we can report the total at the end +BOT_HITS=0 + +for ip in $IPS; do + [[ $DEBUG ]] && echo "(DEBUG) Checking for hits from spider IP: $ip" + + # Check for hits from this spider in Solr and save results into a variable, + # setting a custom curl output format so I can get the HTTP status code and + # Solr response in one request, then tease them out later. + solr_result=$(curl -s -w "http_code=%{http_code}" "$SOLR_URL/$STATISTICS_SHARD/select" -d "q=ip:/$ip/&rows=0") + + http_code=$(echo $solr_result | grep -o -E 'http_code=[0-9]+' | awk -F= '{print $2}') + + # Check the Solr HTTP response code and skip spider if not successful + if [[ $http_code -ne 200 ]]; then + [[ $DEBUG ]] && echo "(DEBUG) Solr query returned HTTP $http_code, skipping $ip." 
+ + continue + fi + + # lazy extraction of Solr numFound (relies on sed -E for extended regex) + numFound=$(echo $solr_result | sed -E 's/\s+http_code=[0-9]+//' | xmllint --format - | grep numFound | sed -E 's/^.*numFound="([0-9]+)".*$/\1/') + + if [[ numFound -gt 0 ]]; then + if [[ $PURGE_SPIDER_HITS ]]; then + echo "Purging $numFound hits from $ip in $STATISTICS_SHARD" + + # Purge the hits and soft commit + curl -s "$SOLR_URL/$STATISTICS_SHARD/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "ip:/$ip/" > /dev/null 2>&1 + else + echo "Found $numFound hits from $ip in $STATISTICS_SHARD" + fi + + BOT_HITS=$((BOT_HITS+numFound)) + fi +done + +if [[ $BOT_HITS -gt 0 ]]; then + if [[ $PURGE_SPIDER_HITS ]]; then + echo + echo "Total number of bot hits purged: $BOT_HITS" + + # Hard commit after we're done processing all spiders + curl -s "$SOLR_URL/$STATISTICS_SHARD/update?commit=true" > /dev/null 2>&1 + else + echo + echo "Total number of hits from bots: $BOT_HITS" + fi +fi + +# vim: set expandtab:ts=4:sw=4:bs=2 diff --git a/ilri/check_duplicates.py b/ilri/check_duplicates.py new file mode 100755 index 000000000000..ed5c1953e99b --- /dev/null +++ b/ilri/check_duplicates.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 + +# check-duplicates.py 0.4.3 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Expects a CSV with at least four columns containing id, item titles, types,and +# issue dates to be checked against the DSpace PostgreSQL database for potential +# duplicates. 
The database must have the trgm extention created in order for +# this to work: +# +# localhost/database= > CREATE EXTENSION pg_trgm; +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install psycopg colorama +# +# See: https://www.psycopg.org/psycopg3/docs + +import argparse +import csv +import signal +import sys +from datetime import datetime + +import util +from colorama import Fore +from psycopg import sql + + +def signal_handler(signal, frame): + sys.exit(1) + + +# Compare the item's date issued to that of the potential duplicate +def compare_date_strings(item_date, duplicate_date): + # Split the item date on "-" to see what format we need to + # use to create the datetime object. + if len(item_date.split("-")) == 1: + date1 = datetime.strptime(item_date, "%Y") + elif len(item_date.split("-")) == 2: + date1 = datetime.strptime(item_date, "%Y-%m") + elif len(item_date.split("-")) == 3: + date1 = datetime.strptime(item_date, "%Y-%m-%d") + + # Do the same for the potential duplicate's date + if len(duplicate_date.split("-")) == 1: + date2 = datetime.strptime(duplicate_date, "%Y") + elif len(duplicate_date.split("-")) == 2: + date2 = datetime.strptime(duplicate_date, "%Y-%m") + elif len(duplicate_date.split("-")) == 3: + date2 = datetime.strptime(duplicate_date, "%Y-%m-%d") + + # Return the difference between the two dates. Doesn't matter which comes + # first here because we are getting the absolute to avoid negative days! 
+ return abs((date1 - date2).days) + + +parser = argparse.ArgumentParser(description="Find duplicate titles.") +parser.add_argument( + "-i", + "--input-file", + help="Path to input CSV file.", + required=True, + type=argparse.FileType("r", encoding="UTF-8"), +) +parser.add_argument("-db", "--database-name", help="Database name", required=True) +parser.add_argument("-u", "--database-user", help="Database username", required=True) +parser.add_argument("-p", "--database-pass", help="Database password", required=True) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "--days-threshold", + type=float, + help="Threshold for difference of days between item and potential duplicates (default 365).", + default=365, +) +parser.add_argument( + "-o", + "--output-file", + help="Path to output CSV file.", + required=True, + type=argparse.FileType("w"), +) +parser.add_argument( + "-q", + "--quiet", + help="Do not print progress messages to the screen.", + action="store_true", +) +parser.add_argument( + "-s", + "--similarity-threshold", + type=float, + help="Similarity threshold, between 0.0 and 1.0 (default 0.6).", + default=0.6, +) +args = parser.parse_args() + +# Column names in the CSV +id_column_name = "id" +criteria1_column_name = "dc.title" +criteria2_column_name = "dcterms.type" +criteria3_column_name = "dcterms.issued" + +# open the CSV +reader = csv.DictReader(args.input_file) + +# check if the title column exists in the CSV +if criteria1_column_name not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f'Specified criteria one column "{criteria1_column_name}" does not exist in the CSV.' + + Fore.RESET + ) + sys.exit(1) +# check if the type column exists in the CSV +if criteria2_column_name not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f'Specified criteria two column "{criteria2_column_name}" does not exist in the CSV.' 
+ + Fore.RESET + ) + sys.exit(1) +# check if the date issued column exists in the CSV +if criteria3_column_name not in reader.fieldnames: + sys.stderr.write( + Fore.RED + + f'Specified criteria three column "{criteria3_column_name}" does not exist in the CSV.' + + Fore.RESET + ) + sys.exit(1) + + +# set the signal handler for SIGINT (^C) +signal.signal(signal.SIGINT, signal_handler) + +# connect to database +conn = util.db_connect( + args.database_name, args.database_user, args.database_pass, "localhost" +) + +# set the connection to read only since we are not writing anything +conn.read_only = True + +cursor = conn.cursor() + +# Field IDs from the metadatafieldregistry table +criteria1_field_id = util.field_name_to_field_id(cursor, criteria1_column_name) +criteria2_field_id = util.field_name_to_field_id(cursor, criteria2_column_name) +criteria3_field_id = util.field_name_to_field_id(cursor, criteria3_column_name) + +with conn: + # Make sure the pg_trgm extension is installed in the current database + cursor.execute("SELECT extname FROM pg_extension WHERE extname='pg_trgm'") + if cursor.rowcount == 0: + sys.stderr.write( + Fore.RED + + f"Database '{args.database_name}' is missing the 'pg_trgm' extension.\n" + + Fore.RESET + ) + sys.exit(1) + + # Set the similarity threshold for this session. PostgreSQL default is 0.3, + # which leads to lots of false positives for this use case. Note that the + # weird syntax here is because of SET not working in in psycopg3. 
+ # + # See: https://www.psycopg.org/psycopg3/docs/basic/from_pg2.html#server-side-binding + cursor.execute( + sql.SQL( + "SET pg_trgm.similarity_threshold = {}".format(args.similarity_threshold) + ) + ) + + # Fields for the output CSV + fieldnames = [ + "id", + "Your Title", + "Their Title", + "Similarity", + "Your Date", + "Their Date", + "Handle", + ] + + # Write the CSV header + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + for input_row in reader: + # Check for items with similarity to criteria one (title). Note that + # this is the fastest variation of this query: using the similarity + # operator (%, written below twice for escaping) instead of the sim- + # larity function, as indexes are bound to operators, not functions! + # Also, if I leave off the item query it takes twice as long! + sql = "SELECT text_value, dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value %% %s" + + cursor.execute( + sql, + ( + criteria1_field_id, + input_row[criteria1_column_name], + ), + ) + + # If we have any similarity in criteria one (title), then check type + if cursor.rowcount > 0: + duplicate_titles = cursor.fetchall() + + # Iterate over duplicate titles to check their types + for duplicate_title in duplicate_titles: + dspace_object_id = duplicate_title[1] + + # Check type of this duplicate title, also making sure that + # the item is in the archive and not withdrawn. + sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s AND M.text_value=%s AND I.in_archive='t' AND I.withdrawn='f'" + + cursor.execute( + sql, + ( + dspace_object_id, + criteria2_field_id, + input_row[criteria2_column_name], + ), + ) + + # This means we didn't match on item type, so let's skip to + # the next item title. 
+ if cursor.rowcount == 0: + continue + + # Get the date of this potential duplicate. (If we are here + # then we already confirmed above that the item is both in + # the archive and not withdrawn, so we don't need to check + # that again). + sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s" + + cursor.execute( + sql, + (dspace_object_id, criteria3_field_id), + ) + + # This means that we successfully extracted the date for the + # potential duplicate. + if cursor.rowcount > 0: + duplicate_item_date = cursor.fetchone()[0] + # If rowcount is not > 0 then the potential duplicate does + # not have a date and we have bigger problems. Skip! + else: + continue + + # Get the number of days between the issue dates + days_difference = compare_date_strings( + input_row[criteria3_column_name], duplicate_item_date + ) + + # Items with a similar title, same type, and issue dates + # within a year or so are likely duplicates. Otherwise, + # it's possible that items with a similar name could be + # like Annual Reports where most metadata is the same + # except the date issued. + if days_difference <= args.days_threshold: + # By this point if we have any matches then they are + # similar in title and have an exact match for the type + # and an issue date within the threshold. Now we are + # reasonably sure it's a duplicate, so get the handle. + sql = "SELECT handle FROM handle WHERE resource_id=%s" + cursor.execute(sql, (dspace_object_id,)) + try: + handle = f"https://hdl.handle.net/{cursor.fetchone()[0]}" + except TypeError: + # If we get here then there is no handle for this + # item's UUID. Could be that the item was deleted? 
+ continue + + sys.stdout.write( + f"{Fore.YELLOW}Found potential duplicate:{Fore.RESET}\n" + ) + + # https://alexklibisz.com/2022/02/18/optimizing-postgres-trigram-search.html + sql = "SELECT round(similarity(%s, %s)::numeric, 3)" + cursor.execute( + sql, (input_row[criteria1_column_name], duplicate_title[0]) + ) + trgm_similarity = cursor.fetchone()[0] + + sys.stdout.write( + f"{Fore.YELLOW}→ Title:{Fore.RESET} {input_row[criteria1_column_name]} ({trgm_similarity})\n" + ) + sys.stdout.write(f"{Fore.YELLOW}→ Handle:{Fore.RESET} {handle}\n\n") + + output_row = { + "id": input_row[id_column_name], + "Your Title": input_row[criteria1_column_name], + "Their Title": duplicate_title[0], + "Similarity": trgm_similarity, + "Your Date": input_row[criteria3_column_name], + "Their Date": duplicate_item_date, + "Handle": handle, + } + + writer.writerow(output_row) + + # close output file before we exit + args.output_file.close() + +# close input file +args.input_file.close() + +sys.exit(0) diff --git a/ilri/check_duplicates_fuzzy.py b/ilri/check_duplicates_fuzzy.py new file mode 100755 index 000000000000..afcfd879e382 --- /dev/null +++ b/ilri/check_duplicates_fuzzy.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 + +# check-duplicates.py 0.4.0 +# +# Copyright Alan Orth. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
def signal_handler(signal, frame):
    """Exit with status 1 when the script is interrupted (SIGINT, ^C)."""
    sys.exit(1)


# strptime() formats for the date precisions we handle, keyed by the number of
# "-"-separated parts in the date string.
_DATE_FORMATS = {1: "%Y", 2: "%Y-%m", 3: "%Y-%m-%d"}


def _parse_partial_date(date_string):
    """Parse a "YYYY", "YYYY-MM" or "YYYY-MM-DD" string into a datetime.

    Raises:
        ValueError: if the string is not in one of the supported formats.
    """
    date_string = date_string.strip()

    try:
        date_format = _DATE_FORMATS[len(date_string.split("-"))]
    except KeyError:
        # Previously this case left the result unbound and surfaced later as
        # an UnboundLocalError, which hid the real problem (a bad date).
        raise ValueError(f"Unsupported date format: {date_string!r}") from None

    return datetime.strptime(date_string, date_format)


# Compare the item's date issued to that of the potential duplicate
def compare_date_strings(item_date, duplicate_date):
    """Return the absolute number of days between two date strings.

    Args:
        item_date: date of the input item ("YYYY", "YYYY-MM" or "YYYY-MM-DD").
        duplicate_date: date of the potential duplicate, same formats.

    Returns:
        The non-negative difference in whole days; the argument order does
        not matter.

    Raises:
        ValueError: if either string is not in a supported format.
    """
    date1 = _parse_partial_date(item_date)
    date2 = _parse_partial_date(duplicate_date)

    # Doesn't matter which comes first here because we are getting the
    # absolute value to avoid negative days!
    return abs((date1 - date2).days)
args = parser.parse_args()

# open the CSV
reader = csv.DictReader(args.input_file)

# Make sure every column we read from the CSV exists before doing any work.
# DictReader.fieldnames is None for an empty file, hence the "or []".
csv_columns = reader.fieldnames or []
required_columns = (
    ("ID", id_column_name),
    ("criteria one", criteria1_column_name),
    ("criteria two", criteria2_column_name),
    ("criteria three", criteria3_column_name),
)
for label, column_name in required_columns:
    if column_name not in csv_columns:
        sys.stderr.write(
            Fore.RED
            + f'Specified {label} column "{column_name}" does not exist in the CSV.\n'
            + Fore.RESET
        )
        sys.exit(1)

# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# connect to database
try:
    # Keyword arguments instead of an interpolated DSN string, so passwords
    # containing spaces or quotes don't need escaping.
    conn = psycopg2.connect(
        dbname=args.database_name,
        user=args.database_user,
        password=args.database_pass,
        host="localhost",
    )

    if args.debug:
        sys.stderr.write(Fore.GREEN + "Connected to database.\n" + Fore.RESET)
except psycopg2.OperationalError:
    sys.stderr.write(Fore.RED + "Could not connect to database.\n" + Fore.RESET)
    sys.exit(1)

with conn:
    # cursor will be closed after this block exits
    # see: http://initd.org/psycopg/docs/usage.html#with-statement
    with conn.cursor() as cursor:
        # Make sure the pg_trgm extension is installed in the current database
        # NOTE(review): the title query below calls levenshtein_less_equal(),
        # which presumably comes from the fuzzystrmatch extension rather than
        # pg_trgm -- confirm whether this check (and the similarity threshold
        # below) should target fuzzystrmatch instead.
        cursor.execute("SELECT extname FROM pg_extension WHERE extname='pg_trgm'")
        if cursor.rowcount == 0:
            sys.stderr.write(
                Fore.RED
                + f"Database '{args.database_name}' is missing the 'pg_trgm' extension.\n"
                + Fore.RESET
            )
            sys.exit(1)

        # Set the similarity threshold for this session. PostgreSQL default is
        # 0.3, which leads to lots of false positives for this use case.
        cursor.execute(
            "SET pg_trgm.similarity_threshold = %s", (args.similarity_threshold,)
        )

        # Fields for the output CSV
        fieldnames = [
            "id",
            "Your Title",
            "Their Title",
            "Your Date",
            "Their Date",
            "Handle",
        ]

        # Write the CSV header
        writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
        writer.writeheader()

        # Candidate titles: archived, non-withdrawn items whose title is
        # within a Levenshtein distance of 3 of the input title, compared
        # case-insensitively. Both sides are cut to 255 characters
        # (presumably the levenshtein() argument limit -- see the PostgreSQL
        # fuzzystrmatch documentation) so that long input titles don't make
        # the query fail.
        title_sql = "SELECT text_value, dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND LEVENSHTEIN_LESS_EQUAL(LEFT(LOWER(%s), 255), LEFT(LOWER(text_value), 255), 3) <= 3"

        # Check type of a candidate, also making sure that the item is in
        # the archive and not withdrawn.
        type_sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s AND M.text_value=%s AND I.in_archive='t' AND I.withdrawn='f'"

        # Get the date of a candidate. By the time this runs the type query
        # has already confirmed that the item is archived and not withdrawn.
        date_sql = "SELECT text_value FROM metadatavalue M JOIN item I ON M.dspace_object_id = I.uuid WHERE M.dspace_object_id=%s AND M.metadata_field_id=%s"

        handle_sql = "SELECT handle FROM handle WHERE resource_id=%s"

        for input_row in reader:
            cursor.execute(
                title_sql,
                (
                    criteria1_field_id,
                    input_row[criteria1_column_name],
                ),
            )

            # Iterate over similar titles (if any) to check their types
            for duplicate_title in cursor.fetchall():
                dspace_object_id = duplicate_title[1]

                cursor.execute(
                    type_sql,
                    (
                        dspace_object_id,
                        criteria2_field_id,
                        input_row[criteria2_column_name],
                    ),
                )

                # This means we didn't match on item type, so let's skip to
                # the next item title.
                if cursor.rowcount == 0:
                    continue

                cursor.execute(date_sql, (dspace_object_id, criteria3_field_id))

                date_row = cursor.fetchone()
                # The potential duplicate does not have a date and we have
                # bigger problems. Skip!
                if date_row is None:
                    continue

                duplicate_item_date = date_row[0]

                # Get the number of days between the issue dates. A date we
                # can't parse can't be compared, so report it and move on
                # instead of aborting the whole run.
                try:
                    days_difference = compare_date_strings(
                        input_row[criteria3_column_name], duplicate_item_date
                    )
                except ValueError as error:
                    sys.stderr.write(
                        Fore.RED + f"Skipping unparseable date: {error}\n" + Fore.RESET
                    )
                    continue

                # Items with a similar title, same type, and issue dates
                # within a year or so are likely duplicates. Otherwise,
                # it's possible that items with a similar name could be
                # like Annual Reports where most metadata is the same
                # except the date issued.
                if days_difference > args.days_threshold:
                    continue

                # Now we are reasonably sure it's a duplicate, so get the
                # handle.
                cursor.execute(handle_sql, (dspace_object_id,))
                try:
                    handle = f"https://hdl.handle.net/{cursor.fetchone()[0]}"
                except TypeError:
                    # If we get here then there is no handle for this
                    # item's UUID. Could be that the item was deleted?
                    continue

                sys.stdout.write(
                    f"{Fore.YELLOW}Found potential duplicate:{Fore.RESET}\n"
                )
                sys.stdout.write(
                    f"{Fore.YELLOW}→ Title:{Fore.RESET} {input_row[criteria1_column_name]}\n"
                )
                sys.stdout.write(f"{Fore.YELLOW}→ Handle:{Fore.RESET} {handle}\n\n")

                writer.writerow(
                    {
                        "id": input_row[id_column_name],
                        "Your Title": input_row[criteria1_column_name],
                        "Their Title": duplicate_title[0],
                        "Your Date": input_row[criteria3_column_name],
                        "Their Date": duplicate_item_date,
                        "Handle": handle,
                    }
                )

    # close output file before we exit
    args.output_file.close()


# close database connection before we exit
conn.close()

# close input file
args.input_file.close()

sys.exit(0)
+ input_filename = sys.argv[1] + output_filename = sys.argv[2] +except IndexError: + print("Please specify input and output files.") + + exit(1) + +with open(input_filename, "r") as countries_in: + with open(output_filename, mode="w") as countries_out: + # Prepare the CSV + fieldnames = ["alpha2", "Name"] + csv_writer = csv.DictWriter(countries_out, fieldnames=fieldnames) + csv_writer.writeheader() + + for line in countries_in.readlines(): + print(f"Looking up {line.strip()}...") + + country_result = pycountry.countries.get(name=line.strip()) + + # Check if we found an exact match first + if country_result is not None: + country_alpha2 = country_result.alpha_2 + country_name = line.strip() + else: + # Can't find a match so just save the name with no alpha2. Note + # that we could try with a fuzzy search before giving up, but I + # have had some strange issues with fuzzy search in the past. + # + # See: https://github.com/flyingcircusio/pycountry/issues/115 + country_alpha2 = "" + country_name = line.strip() + + csv_writer.writerow({"alpha2": country_alpha2, "Name": country_name}) diff --git a/ilri/create-value-pairs.sh b/ilri/create-value-pairs.sh new file mode 100755 index 000000000000..ae70dd057926 --- /dev/null +++ b/ilri/create-value-pairs.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# +# ./create-value-pairs.sh terms.txt terms-name + +printf '\n' $2 + +while read -r line +do + printf ' \n' + printf ' %s\n' "$line" + printf ' %s\n' "$line" + printf ' \n' +done < $1 + +printf '' diff --git a/ilri/crossref_doi_lookup.py b/ilri/crossref_doi_lookup.py new file mode 100755 index 000000000000..b11a67af0c5a --- /dev/null +++ b/ilri/crossref_doi_lookup.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +# +# crossref-doi-lookup.py 0.2.1 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public Crossref API for DOIs read from a text file (one per line). 
+# The Crossref database has a wealth of information about DOIs, for example the +# issue date, license, journal title, item type, authors, funders, etc. This +# information can be used to improve metadata in other systems. +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import csv +import logging +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +import util +from colorama import Fore + +# Create a local logger instance for this module. We don't do any configuration +# because this module might be used elsewhere that will have its own logging +# configuration. +logger = logging.getLogger(__name__) + + +# Crossref uses dates with single-digit month and day parts, so we need to pad +# them with zeros if they are less than 10. +def fix_crossref_date(crossref_date: list) -> str: + if len(crossref_date) == 1: + issued = crossref_date[0] + elif len(crossref_date) == 2: + if crossref_date[1] < 10: + crossref_date_month = f"0{crossref_date[1]}" + else: + crossref_date_month = crossref_date[1] + + issued = f"{crossref_date[0]}-{crossref_date_month}" + elif len(crossref_date) == 3: + if crossref_date[1] < 10: + crossref_date_month = f"0{crossref_date[1]}" + else: + crossref_date_month = crossref_date[1] + + if crossref_date[2] < 10: + crossref_date_day = f"0{crossref_date[2]}" + else: + crossref_date_day = crossref_date[2] + + issued = f"{crossref_date[0]}-{crossref_date_month}-{crossref_date_day}" + else: + issued = "" + + return issued + + +def resolve_doi(doi: str) -> None: + logger.info(Fore.GREEN + f"Looking up DOI: {doi}" + Fore.RESET) + + # First, check if this DOI is registered at Crossref + request_url = f"https://api.crossref.org/works/{doi}/agency" + request_params = {"mailto": args.email} + + try: + request = 
requests.get(request_url, params=request_params) + except requests.exceptions.ConnectionError: + logger.error(Fore.RED + "Connection error." + Fore.RESET) + + sys.exit(1) + + # HTTP 404 here means the DOI is not registered at Crossref + if not request.ok: + logger.debug( + Fore.YELLOW + + f"> DOI not in Crossref (cached: {request.from_cache})" + + Fore.RESET + ) + + return + + data = request.json() + + # Only proceed if this DOI registration agency is Crossref + match data["message"]["agency"]["label"]: + case "DataCite": + logger.debug( + Fore.YELLOW + + f"> Skipping DOI registered to DataCite (cached: {request.from_cache})" + + Fore.RESET + ) + + return + case "Public": + logger.debug( + Fore.YELLOW + + f'> Skipping DOI registered to "Public" (cached: {request.from_cache})' + + Fore.RESET + ) + + return + case "Crossref": + pass + + # Fetch the metadata for this DOI + request_url = f"https://api.crossref.org/works/{doi}" + request_params = {"mailto": args.email} + + try: + request = requests.get(request_url, params=request_params) + except requests.exceptions.ConnectionError: + logger.error(Fore.RED + "Connection error." + Fore.RESET) + + if not request.ok: + return + + logger.debug( + Fore.YELLOW + f"> DOI in Crossref (cached: {request.from_cache})" + Fore.RESET + ) + + data = request.json() + + # I don't know why title is an array of strings, but let's just get + # the first one. + try: + title = data["message"]["title"][0] + except IndexError: + title = "" + + # Create an empty list to keep our authors + authors = list() + affiliations = list() + + try: + for author in data["message"]["author"]: + # Some authors have no given name in Crossref + try: + # Crossref given name is often initials like "S. M." + # and we don't want that space! + author_given_name = author["given"].replace(". 
", ".") + except KeyError: + author_given_name = None + + # Some authors have no family name in Crossref + try: + author_family_name = author["family"] + except KeyError: + author_family_name = None + + # Naive construction of "Last, First Initials" when we have + # both of them. + if author_family_name and author_given_name: + authors.append(f"{author_family_name}, {author_given_name}") + # Otherwise we need to make do with only the family name + elif author_family_name and author_given_name is None: + authors.append(author_family_name) + # And sometimes we need to make do with only the given name + elif author_given_name and author_family_name is None: + authors.append(author_given_name) + + # Get any affiliations from the authors (not all have) + try: + for affiliation in author["affiliation"]: + if affiliation["name"] not in affiliations: + affiliations.append(affiliation["name"]) + # Not sure what we can except here + except: + pass + + # Believe it or not some items on Crossref have no author (doesn't + # mean the DOI itself won't, though). + # + # See: https://api.crossref.org/works/10.1638/2018-0110 + # See: https://doi.org/10.1638/2018-0110 + except KeyError: + authors = "" + + # Create an empty list to keep our funders + funders = list() + + try: + for funder in data["message"]["funder"]: + if funder["name"] not in funders: + funders.append(funder["name"]) + except KeyError: + pass + + # Get the abstract if it exists + try: + abstract = data["message"]["abstract"] + except KeyError: + abstract = "" + + try: + journal = data["message"]["container-title"][0] + except IndexError: + journal = "" + + # Create an empty list to hold ISSNs, as there could be more than one + issns = list() + + # Get the ISSN. For journal articles there is often a print ISSN and + # an electric ISSN. 
+ try: + for issn in data["message"]["ISSN"]: + issns.append(issn) + except KeyError: + issns = "" + + # Create an empty list to hold ISBNs, as there could be more than one + isbns = list() + + # Get the ISBN. For books and book chapters there is often a print + # ISBN and an electric ISBN. + try: + for isbn in data["message"]["isbn-type"]: + isbns.append(isbn["value"]) + except KeyError: + isbns = "" + + try: + publisher = data["message"]["publisher"] + except KeyError: + publisher = "" + + try: + volume = data["message"]["volume"] + except KeyError: + volume = "" + + try: + issue = data["message"]["issue"] + except KeyError: + issue = "" + + try: + page = data["message"]["page"] + except KeyError: + page = "" + + try: + item_type = data["message"]["type"] + except KeyError: + item_type = "" + + subjects = list() + + # Get the subjects. Still not sure if these are useful. We should + # check against AGROVOC before importing. + try: + for subject in data["message"]["subject"]: + subjects.append(subject) + except KeyError: + subjects = "" + + # It appears that *all* DOIs on Crossref have an "issued" date. This + # is the earliest of the print and online publishing dates. For now + # I will capture this so I can explore its implications and relation + # to other dates with real items in the repository. + # + # See: https://github.com/CrossRef/rest-api-doc/blob/master/api_format.md + issued = fix_crossref_date(data["message"]["issued"]["date-parts"][0]) + + # Date on which the work was published in print. Apparently not all + # DOIs have this so we need to try/except. Also note that there is + # a similar date in ["journal-issue"]["published-print"], but in my + # experience it is the same as this one 99% of the time when it is + # present (that's in 10,000 DOIs I checked in 2023-02). 
+ try: + published_print = fix_crossref_date( + data["message"]["published-print"]["date-parts"][0] + ) + except KeyError: + published_print = "" + + # Date on which the work was published online. Note again that there + # is also ["journal-issue"]["published-online"], but in my experience + # it is only present ~33% of the time, and is only 50% the same as + # published-online. For now I'm not sure what to make of that, so I + # will not use it. + try: + published_online = fix_crossref_date( + data["message"]["published-online"]["date-parts"][0] + ) + except KeyError: + published_online = "" + + # Not all items have licenses, and some have multiple licenses. We + # will check for licenses in the order we prefer them: am, vor, tdm, + # and unspecified. These correspond to: accepted manuscript, version + # of record, text and data mining, and unspecified. I'm curious if + # there is *ever* a case where we would want the tdm license...? Can + # these ever be CC if the others are missing? + doi_licenses = {} + try: + for doi_license in data["message"]["license"]: + content_version = doi_license["content-version"] + doi_licenses[content_version] = doi_license["URL"] + + if "am" in doi_licenses: + license_url = f'am: {doi_licenses["am"]}' + elif "vor" in doi_licenses: + license_url = f'vor: {doi_licenses["vor"]}' + elif "tdm" in doi_licenses: + license_url = f'tdm: {doi_licenses["tdm"]}' + else: + license_url = f'unspecified: {doi_licenses["unspecified"]}' + except KeyError: + license_url = "" + + writer.writerow( + { + "title": title, + "abstract": abstract, + "authors": "||".join(authors), + "affiliations": "||".join(affiliations), + "funders": "||".join(funders), + "doi": f"https://doi.org/{doi}", + "journal": journal, + "issn": "||".join(issns), + "isbn": "||".join(isbns), + "publisher": publisher, + "volume": volume, + "issue": issue, + "page": page, + "type": item_type, + "issued": issued, + "published_print": published_print, + "published_online": 
published_online, + "license": license_url, + "subjects": "||".join(subjects), + } + ) + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the Crossref REST API for metadata about DOIs." +) +parser.add_argument( + "-e", + "--email", + required=True, + help="Contact email to use in API requests so Crossref is more lenient with our request rate.", +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing DOIs to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file (CSV) to write results to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# Since we're running interactively we can set the preferred log format for +# the logging module during this invocation. 
+logging.basicConfig(format="[%(levelname)s] %(message)s") + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# install a transparent requests cache +expire_after = timedelta(days=30) +requests_cache.install_cache( + "requests-cache", expire_after=expire_after, allowable_codes=(200, 404) +) +# prune old cache entries +requests_cache.delete() + +# Write the CSV header before starting +if args.output_file: + fieldnames = [ + "title", + "abstract", + "authors", + "affiliations", + "funders", + "doi", + "journal", + "issn", + "isbn", + "publisher", + "volume", + "issue", + "page", + "type", + "issued", + "published_print", + "published_online", + "license", + "subjects", + ] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + +# if the user specified an input file, get the DOIs from there +if args.input_file: + dois = util.read_dois_from_file(args.input_file) + for doi in dois: + resolve_doi(doi) + +# close output file before we exit +args.output_file.close() diff --git a/ilri/crossref_funders_lookup.py b/ilri/crossref_funders_lookup.py new file mode 100755 index 000000000000..d58162386bf4 --- /dev/null +++ b/ilri/crossref_funders_lookup.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +# +# crossref-funders-lookup.py 0.3.1 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public Crossref API for funders read from a text file. Text file +# should have one subject per line. 
# read funders from a text file, one per line
def read_funders_from_file():
    """Read unique funder names from ``args.input_file`` and resolve them.

    Blank lines are ignored. The input file is closed before the lookups
    start.
    """
    # Trim leading and trailing whitespace (including newlines), skip blank
    # lines, and let dict.fromkeys() drop duplicates while keeping the order.
    stripped_lines = (line.strip() for line in args.input_file)
    funders = list(dict.fromkeys(line for line in stripped_lines if line))

    # close input file before we exit
    args.input_file.close()

    resolve_funders(funders)


def _match_funder(funder, items):
    """Return how ``funder`` matches the Crossref search results.

    Returns "name" if an item's name equals the funder (ignoring case),
    "alt-name" if one of an item's alt-names does, and None otherwise. Items
    are checked in order and the first match wins.
    """
    wanted = funder.lower()

    for item in items:
        if item["name"].lower() == wanted:
            return "name"

        # Use .get() so an item without alt-names doesn't raise KeyError
        if any(altname.lower() == wanted for altname in item.get("alt-names", [])):
            return "alt-name"

    return None


def resolve_funders(funders):
    """Query Crossref for each funder and write the results as CSV.

    Writes one row per funder to ``args.output_file`` with the columns
    "funder", "match type" ("name", "alt-name" or empty) and "matched"
    ("true" or "false"), then closes the output file. Funders whose request
    fails are reported on stderr and get no row.
    """
    fieldnames = ["funder", "match type", "matched"]
    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with two weeks expiry because I don't
    # know how often Crossref is updated.
    expire_after = timedelta(days=14)
    requests_cache.install_cache("requests-cache", expire_after=expire_after)

    # prune old cache entries (delete() without arguments removes nothing)
    requests_cache.delete(expired=True)

    request_url = "https://api.crossref.org/funders"

    for funder in funders:
        if args.debug:
            sys.stderr.write(Fore.GREEN + f"Looking up funder: {funder}\n" + Fore.RESET)

        request_params = {"query": funder}

        if args.email:
            request_params.update(mailto=args.email)

        try:
            request = requests.get(request_url, params=request_params)
        except requests.exceptions.ConnectionError:
            sys.stderr.write(Fore.RED + "Connection error.\n" + Fore.RESET)

            # There is no response to inspect, so move on to the next funder
            # instead of reusing the previous one (or hitting a NameError).
            continue

        if request.status_code != requests.codes.ok:
            sys.stderr.write(
                Fore.RED
                + f"Unexpected HTTP {request.status_code} for {funder}, skipping.\n"
                + Fore.RESET
            )

            continue

        match_type = _match_funder(funder, request.json()["message"].get("items", []))

        if match_type == "name":
            print(
                f"Exact match for {funder} in Crossref (cached: {request.from_cache})"
            )
        elif match_type == "alt-name":
            print(
                f"Alt-name match for {funder} in Crossref (cached: {request.from_cache})"
            )
        elif args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + f"No match for {funder} in Crossref (cached: {request.from_cache})\n"
                + Fore.RESET
            )

        writer.writerow(
            {
                "funder": funder,
                "match type": match_type or "",
                "matched": "true" if match_type else "false",
            }
        )

    # close output file before we exit
    args.output_file.close()


def signal_handler(signal, frame):
    """Close the output file and exit with status 1 on SIGINT (^C)."""
    # close output file before we exit
    args.output_file.close()

    sys.exit(1)
# read journals from a text file, one per line
def read_issns_from_file():
    """Read unique ISSNs from ``args.input_file`` and resolve them.

    Blank lines are ignored. The input file is closed before the lookups
    start.
    """
    # Trim leading and trailing whitespace (including newlines), skip blank
    # lines, and let dict.fromkeys() drop duplicates while keeping the order.
    stripped_lines = (line.strip() for line in args.input_file)
    issns = list(dict.fromkeys(line for line in stripped_lines if line))

    # close input file before we exit
    args.input_file.close()

    resolve_issns(issns)


def resolve_issns(issns):
    """Query Crossref for each ISSN and write the journal titles as CSV.

    Writes one row per ISSN to ``args.output_file`` with the columns "issn"
    and "journal title" (empty when Crossref has no matching journal), then
    closes the output file. ISSNs whose request fails with a connection
    error are reported on stderr and get no row.
    """
    fieldnames = ["issn", "journal title"]
    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with two weeks expiry because I don't
    # know how often Crossref is updated.
    expire_after = timedelta(days=14)
    requests_cache.install_cache("requests-cache", expire_after=expire_after)

    # prune old cache entries (delete() without arguments removes nothing)
    requests_cache.delete(expired=True)

    # Only send mailto when we were given an email. This must always be
    # defined, otherwise running without --email fails on an unbound name.
    request_params = {"mailto": args.email} if args.email else {}

    for issn in issns:
        if args.debug:
            sys.stderr.write(Fore.GREEN + f"Looking up ISSN: {issn}\n" + Fore.RESET)

        request_url = f"https://api.crossref.org/journals/{issn}"

        try:
            request = requests.get(request_url, params=request_params)
        except requests.exceptions.ConnectionError:
            sys.stderr.write(Fore.RED + "Connection error.\n" + Fore.RESET)

            # There is no response to inspect, so move on to the next ISSN
            # instead of reusing the previous one (or hitting a NameError).
            continue

        matched = False
        journal_title = ""

        # CrossRef responds 404 if a journal isn't found, so we check for an
        # HTTP 2xx response here
        if request.status_code == requests.codes.ok:
            message = request.json()["message"]

            # sanity check if our ISSN is in CrossRef's response, ignoring
            # the case of a trailing "X" check digit
            crossref_issns = {known.upper() for known in message.get("ISSN", [])}

            if issn.upper() in crossref_issns:
                matched = True
                journal_title = message["title"]

        if matched:
            print(f"Exact match for {issn} in Crossref (cached: {request.from_cache})")
        elif args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + f"No match for {issn} in Crossref (cached: {request.from_cache})\n"
                + Fore.RESET
            )

        # Every ISSN we got an answer for gets a row, matched or not
        writer.writerow({"issn": issn, "journal title": journal_title})

    # close output file before we exit
    args.output_file.close()


def signal_handler(signal, frame):
    """Close the output file and exit with status 1 on SIGINT (^C)."""
    # close output file before we exit
    args.output_file.close()

    sys.exit(1)
def signal_handler(signal, frame):
    """Handle SIGINT (^C) by exiting with a non-zero status.

    An interrupted run has not finished its work, so it must not report
    success (status 0) to the calling shell. The two parameters are the
    signal number and stack frame supplied by Python; neither is used.
    """
    sys.exit(1)
# The field ID depends only on the --from-field-name argument, so resolve it
# once instead of once per CSV row.
# NOTE(review): assumes util.field_name_to_field_id() is a plain lookup with
# no per-row side effects — confirm against util.py.
metadata_field_id = util.field_name_to_field_id(cursor, args.from_field_name)

for row in reader:
    value = row[args.from_field_name]

    # Get item UUIDs for metadata values that will be deleted
    sql = "SELECT dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(sql, (metadata_field_id, value))

    # No archived item carries this value, so there is nothing to do
    if cursor.rowcount <= 0:
        continue

    if args.dry_run:
        if not args.quiet:
            print(
                Fore.GREEN
                + "Would delete {0} occurences of: {1}".format(cursor.rowcount, value)
                + Fore.RESET
            )

        # Since this a dry run we can continue to the next replacement
        continue

    # Get the records for items with matching metadata. We will use the
    # object IDs to update their last_modified dates.
    matching_records = cursor.fetchall()

    sql = "DELETE from metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(sql, (metadata_field_id, value))

    if cursor.rowcount > 0 and not args.quiet:
        print(
            Fore.GREEN
            + "Deleted {0} occurences of: {1}".format(cursor.rowcount, value)
            + Fore.RESET
        )

    # Update the last_modified date for each item we've changed
    for record in matching_records:
        util.update_item_last_modified(cursor, record[0])
def resolve_doi(dois):
    """Look up one DOI in the DSpace database and write its title and Handle.

    A row with "title", "handle" and "doi" is written to the module-level CSV
    ``writer`` only when exactly one item carries the DOI and that item has
    exactly one title and one Handle value. In every other case a message is
    printed and nothing is written.

    The parameter keeps its historical name ``dois`` for compatibility, but
    it is a single DOI string: the caller invokes this once per DOI.
    """
    import re  # local import: only needed to escape the DOI for the regex

    # Use the argument rather than relying on a global that happens to be
    # named "doi" in the calling loop.
    doi = dois

    # metadata_field_id for metadata values (from metadatafieldregistry and
    # might differ from site to site).
    title_metadata_field_id = 64
    handle_metadata_field_id = 25
    doi_metadata_field_id = 220

    print(f"Looking up {doi} in database")

    cursor = conn.cursor()

    with conn.transaction():
        # Make a temporary string we can use with the PostgreSQL regex. DOIs
        # routinely contain regex metacharacters such as "." and "(", so they
        # are escaped to be matched literally.
        doi_string = f".*{re.escape(doi)}.*"

        # get the dspace_object_id for the item with this DOI
        sql = "SELECT dspace_object_id FROM metadatavalue WHERE metadata_field_id=%s AND text_value ~* %s"
        cursor.execute(
            sql,
            (doi_metadata_field_id, doi_string),
        )

        # make sure rowcount is exactly 1, because some DOIs are used
        # multiple times and I ain't got time for that right now.
        #
        # The row-count decision is kept separate from --quiet: quiet only
        # silences progress messages, it must not change which DOIs resolve.
        if cursor.rowcount == 1:
            dspace_object_id = cursor.fetchone()[0]

            if not args.quiet:
                print(f"Found {doi}, DSpace object: {dspace_object_id}")
        elif cursor.rowcount > 1:
            if not args.quiet:
                print(f"Found multiple items for {doi}")

            return
        else:
            print(f"Not found: {doi}")

            return

        # get the title
        sql = "SELECT text_value FROM metadatavalue WHERE metadata_field_id=%s AND dspace_object_id=%s"
        cursor.execute(sql, (title_metadata_field_id, dspace_object_id))

        if cursor.rowcount != 1:
            print(f"Missing title for {doi}, skipping")

            return

        title = cursor.fetchone()[0]

        # get the handle (same query, different metadata field)
        cursor.execute(sql, (handle_metadata_field_id, dspace_object_id))

        if cursor.rowcount != 1:
            print(f"Missing handle for {doi}, skipping")

            return

        handle = cursor.fetchone()[0]

        row = {
            "title": title,
            "handle": handle,
            "doi": doi,
        }

        writer.writerow(row)
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# connect to database
conn = util.db_connect(
    args.database_name, args.database_user, args.database_pass, "localhost"
)

# Set this connection to be read only since we are not modifying the database
conn.read_only = True

# field names for the CSV
fieldnames = ["title", "handle", "doi"]

writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
writer.writeheader()

# Resolve each DOI in turn; resolve_doi() writes the matching rows itself
dois = util.read_dois_from_file(args.input_file)
for doi in dois:
    resolve_doi(doi)

# close output file before we exit
args.output_file.close()

# close database connection before we exit
conn.close()

# Use sys.exit() rather than the exit() helper: the latter is injected by the
# site module for interactive use and is not guaranteed to exist.
sys.exit(0)
def parse_community(community_id):
    """Return the Initiative collections of a community as ``{name: handle}``.

    Requests the community from the DSpace REST API with its collections
    expanded and keeps every collection whose name contains
    ``initiative_column_name_prefix``. Writes an error to stderr and exits
    with status 1 if the API cannot be reached or does not answer with an OK
    status.
    """
    request_url = (
        f"{rest_base_url}{rest_communities_endpoint}{community_id}?expand=collections"
    )

    try:
        request = requests.get(request_url, headers={"user-agent": rest_user_agent})
    except requests.ConnectionError:
        sys.stderr.write(
            f"{Fore.RED}Could not connect to {args.rest_url}.{Fore.RESET}\n"
        )
        # sys.exit() instead of exit(): the latter is an interactive helper
        # added by the site module and is not guaranteed to exist.
        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(
            f"{Fore.RED}Status not OK! Request URL was: {request_url}{Fore.RESET}\n"
        )
        sys.exit(1)

    # We are only interested in Initiative collections
    return {
        collection["name"]: collection["handle"]
        for collection in request.json()["collections"]
        if initiative_column_name_prefix in collection["name"]
    }
# Enable transparent request cache with one day expiry, as we are worried that
# Initiative names could have changed.
expire_after = timedelta(days=1)
requests_cache.install_cache("requests-cache", expire_after=expire_after)

# Prune old cache entries
requests_cache.delete()

# Fetch the controlled vocabulary for Initiatives
try:
    request = requests.get(
        initiatives_list_url, headers={"user-agent": rest_user_agent}
    )
except requests.ConnectionError:
    # This request goes to the vocabulary URL, not to the DSpace REST API
    sys.stderr.write(
        f"{Fore.RED}Could not connect to {initiatives_list_url}.{Fore.RESET}\n"
    )
    sys.exit(1)

# Without the vocabulary none of the checks below can run, so stop here
# instead of failing later on an undefined initiatives_list.
if request.status_code != requests.codes.ok:
    sys.stderr.write(
        f"{Fore.RED}Could not fetch the Initiatives list (HTTP {request.status_code}): {initiatives_list_url}{Fore.RESET}\n"
    )
    sys.exit(1)

# Convert the response text to a list so we can use it for lookups later
initiatives_list = request.text.splitlines()

# Fetch the metadata for the given community handle
request_url = rest_base_url + rest_handle_endpoint + str(handle)
try:
    request = requests.get(request_url, headers={"user-agent": rest_user_agent})
except requests.ConnectionError:
    sys.stderr.write(
        f"{Fore.RED}Could not connect to REST API: {args.rest_url}.{Fore.RESET}\n"
    )
    sys.exit(1)

# Check the request status
if request.status_code != requests.codes.ok:
    sys.stderr.write(
        f"{Fore.RED}Request failed. Are you sure {handle} is a valid handle?{Fore.RESET}\n"
    )
    sys.exit(1)

handle_type = request.json()["type"]

# Make sure the given handle is a community
if handle_type != "community":
    sys.stderr.write(
        f'{Fore.RED}{handle} is type "{handle_type}", not community.{Fore.RESET}\n'
    )
    sys.exit(1)

community_id = request.json()["uuid"]
initiative_collections = parse_community(community_id)

# Open the input file
reader = csv.DictReader(args.input_file)

# Check if the columns exist in the input file
required_columns = (
    ("ID", id_column_name),
    ("collection", collection_column_name),
    ("Initiative", initiative_column_name),
)
for column_label, column_name in required_columns:
    if column_name not in reader.fieldnames:
        sys.stderr.write(
            f'{Fore.RED}Specified {column_label} column "{column_name}" does not exist in the CSV.{Fore.RESET}\n'
        )
        sys.exit(1)

# Fields for the output CSV
fieldnames = [
    id_column_name,
    collection_column_name,
]

# Write the CSV header
writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
writer.writeheader()
+ correct_initiative_collection = initiative_collections[ + f"{initiative_column_name_prefix}{item_initiative}" + ] + + if correct_initiative_collection in item_collections: + if args.debug: + print( + f"{Fore.GREEN}(Phase 1) {item_id} is correctly mapped to Initiative collection: {correct_initiative_collection} ({item_initiative}){Fore.RESET}" + ) + else: + print( + f"{Fore.YELLOW}(Phase 1) {item_id} mapping to Initiative collection: {correct_initiative_collection} ({item_initiative}){Fore.RESET}" + ) + + # Add the collection + item_collections.append(correct_initiative_collection) + elif not item_initiative: + if args.debug: + sys.stderr.write( + f"{Fore.RED}(Phase 1) {item_id} has no Initiative metadata{Fore.RESET}\n" + ) + else: + sys.stderr.write( + f"{Fore.RED}(Phase 1) {item_id} has invalid Initiative: {item_initiative}{Fore.RESET}\n" + ) + + # Empty list to hold incorrectly mapped collections we find for this item + incorrectly_mapped_collections = [] + + # Second, iterate over the item's collections to see if each one has corre- + # sponding Initiative metadata. + for item_collection in item_collections: + # Is it an Initiatve collection? + if item_collection in initiative_collections.values(): + # Now check if this item is tagged with metadata for the corre- + # sponding Initative. We technically want to do a reverse look- + # up in the dict to find the key (initiative) for the current + # collection, but that's not possible. Instead iterate over the + # dict's keys/values and do some sanity checks. 
+ for initiative, collection in initiative_collections.items(): + # If current item collection matches the current Initiative + # collection then we need to check if the Initiative name + # also matches the item's metadata + if item_collection == collection: + # Remember the collection names use the long Initiative name + initiative_short_name = initiative.replace( + initiative_column_name_prefix, "" + ) + + if initiative_short_name in item_initiatives: + if args.debug: + print( + f"{Fore.GREEN}(Phase 2) {item_id} is correctly mapped to Initiative collection: {collection} ({initiative_short_name}){Fore.RESET}" + ) + + continue + else: + if args.remove: + sys.stderr.write( + f"{Fore.YELLOW}(Phase 2) {item_id} unmapping from Initiative collection: {collection} ({initiative_short_name}){Fore.RESET}\n" + ) + + incorrectly_mapped_collections.append(collection) + else: + sys.stderr.write( + f"{Fore.RED}(Phase 2) {item_id} is incorrectly mapped to Initiative collection: {collection} ({initiative_short_name}){Fore.RESET}\n" + ) + + for incorrectly_mapped_collection in incorrectly_mapped_collections: + item_collections.remove(incorrectly_mapped_collection) + + # We only need to save the item to the output CSV if we have changed its + # mappings. Check the mutated item_collections list against the original + # from the input CSV. + if item_collections != input_row[collection_column_name].split("||"): + # We only need to write the IDs and collections to the output file since we + # are not modifying any other metadata in the CSV. 
+ output_row = { + id_column_name: input_row[id_column_name], + collection_column_name: "||".join(item_collections), + } + + writer.writerow(output_row) + +# close CSV files before we exit +args.input_file.close() +args.output_file.close() + +sys.exit(0) diff --git a/ilri/fix_maxmind_stats.py b/ilri/fix_maxmind_stats.py new file mode 100755 index 000000000000..c27bc0f31f20 --- /dev/null +++ b/ilri/fix_maxmind_stats.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# +# fix_maxmind_stats.py v0.0.1 +# +# Fix DSpace statistics containing literal MaxMind city JSON objects, for +# example: +# +# - com.maxmind.geoip2.record.City [ {"geoname_id":3936456,"names":{"de":"Lima","ru":"Лима","pt-BR":"Lima","ja":"リマ","en":"Lima","fr":"Lima","es":"Lima"}} ] +# - com.maxmind.geoip2.record.City [ {} ] +# +# See: https://github.com/DSpace/DSpace/issues/9118 +# +# The input file is a multi-line JSON exported from a DSpace 6.x Solr statistics +# core using solr-import-export-json. I exported all statistics documents that +# were affected using the Solr query "city:com*". +# +# Notes: +# +# I tried to use json from the stdlib but it doesn't support multi-line JSON. +# I tried to use pandas read_json(), but it introduces a whole bunch of other +# issues with data types, missing values, etc. In the end it was much simpler +# to use the jsonlines package. 
def fix_city(value):
    """Return the English city name from a literal MaxMind City string.

    ``value`` looks like ``com.maxmind.geoip2.record.City [ {...json...} ]``.
    The wrapper is removed from the two ends only, the remaining JSON object
    is parsed and its ``names.en`` entry is returned. An empty string is
    returned when the object has no English name.

    Raises ``json.JSONDecodeError`` if what remains after unwrapping is not
    valid JSON.
    """
    prefix = "com.maxmind.geoip2.record.City [ "
    suffix = " ]"

    # Strip the wrapper only where it wraps. Replacing every occurrence would
    # also eat " ]" sequences inside the JSON payload itself.
    if value.startswith(prefix):
        value = value[len(prefix) :]
    if value.endswith(suffix):
        value = value[: -len(suffix)]

    # Try to read the cleaned string as a dict and access the English name
    try:
        # Assuming all city objects have an English version
        value = json.loads(value)["names"]["en"]
    except KeyError:
        value = ""

    return value
def signal_handler(signal, frame):
    """Handle SIGINT (^C) by terminating the script with exit status 1.

    Both parameters are supplied by Python when the signal fires (signal
    number and current stack frame); neither is inspected.
    """
    # Same effect as sys.exit(1), which raises this exception
    raise SystemExit(1)
# The field ID depends only on the --from-field-name argument, so resolve it
# once instead of once per CSV row.
# NOTE(review): assumes util.field_name_to_field_id() is a plain lookup with
# no per-row side effects — confirm against util.py.
metadata_field_id = util.field_name_to_field_id(cursor, args.from_field_name)

for row in reader:
    old_value = row[args.from_field_name]
    new_value = row[args.to_field_name]

    if old_value == new_value:
        # sometimes editors send me corrections with identical search/replace patterns
        logger.debug(
            Fore.YELLOW
            + f"Skipping identical search and replace for value: {old_value}"
            + Fore.RESET
        )

        continue

    if "|" in new_value:
        # sometimes editors send me corrections with multi-value fields, which are supported in DSpace itself, but not here!
        logger.debug(
            Fore.YELLOW
            + f"Skipping correction with invalid | character: {new_value}"
            + Fore.RESET
        )

        continue

    # Get item UUIDs for metadata values that will be updated
    sql = "SELECT dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(sql, (metadata_field_id, old_value))

    # No archived item carries this value, so there is nothing to fix
    if cursor.rowcount <= 0:
        continue

    if args.dry_run:
        if not args.quiet:
            logger.info(
                Fore.GREEN
                + f"(DRY RUN) Fixed {cursor.rowcount} occurences of: {old_value}"
                + Fore.RESET
            )

        # Since this a dry run we can continue to the next replacement
        continue

    # Get the records for items with matching metadata. We will use the
    # object IDs to update their last_modified dates.
    matching_records = cursor.fetchall()

    sql = "UPDATE metadatavalue SET text_value=%s WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s"
    cursor.execute(
        sql,
        (
            new_value,
            metadata_field_id,
            old_value,
        ),
    )

    if cursor.rowcount > 0 and not args.quiet:
        logger.info(
            Fore.GREEN
            + f"Fixed {cursor.rowcount} occurences of: {old_value}"
            + Fore.RESET
        )

    # Update the last_modified date for each item we've changed
    for record in matching_records:
        util.update_item_last_modified(cursor, record[0])
# Cities to pick from for each supported country code
_CITIES_BY_COUNTRY = {
    "MX": [
        "Oaxaca",
        "Juarez",
        "Puebla",
        "Mexico",
        "Texmelucan",
        "Cancún",
        "Tultitlán",
        "Minatitlán",
    ],
    "HN": ["El Progreso", "Tegucigalpa", "San Pedro Sula", "La Ceiba"],
    "CO": [
        "Bogotá",
        "Medellín",
        "Cali",
        "Jamundi",
        "Barranquilla",
        "Villavicencio",
    ],
    "BR": [
        "Sao Luis",
        "Rio De Janeiro",
        "Guaira",
        "Cruzeiro Do Sul",
        "Santo Antonio De Jesus",
        "Valinhos",
        "Ituiutaba",
        "Sobradinho",
        "Maringa",
    ],
    "NI": [
        "Chinandega",
        "Managua",
        "Masaya",
        "San Juan Del Sur",
        "Matagalpa",
        "Estelí",
        "León",
        "Acoyapa",
    ],
}

# Continent code for each supported country code
_CONTINENT_BY_COUNTRY = {
    "MX": "NA",
    "HN": "NA",
    "CO": "SA",
    "BR": "SA",
    "NI": "NA",
}


def random_datetime() -> datetime:
    """Return a random UTC datetime between the upload and the last check.

    The explicit "+00:00" offset is used instead of a trailing "Z" because
    ``datetime.fromisoformat`` only accepts "Z" from Python 3.11 onwards.
    """
    # When the item was uploaded to CGSpace
    start_date = datetime.fromisoformat("2023-09-26T00:00:00+00:00")
    # When the researcher last checked the statistics
    end_date = datetime.fromisoformat("2023-10-20T00:00:00+00:00")

    return random.random() * (end_date - start_date) + start_date


def random_city(country_code: str) -> str:
    """Return a random city name for one of the supported country codes.

    Raises ``ValueError`` for a country code that has no city list.
    """
    try:
        cities = _CITIES_BY_COUNTRY[country_code]
    except KeyError:
        raise ValueError(f"Unsupported country code: {country_code}") from None

    return random.choice(cities)


def country_continent(country_code: str) -> str:
    """Return the continent code ("NA" or "SA") for a supported country code.

    Raises ``ValueError`` for a country code that has no continent mapping.
    """
    try:
        return _CONTINENT_BY_COUNTRY[country_code]
    except KeyError:
        raise ValueError(f"Unsupported country code: {country_code}") from None
with jsonlines.open(output_filename, mode="a") as writer:
    for country_code in ("MX", "HN", "CO", "BR", "NI"):
        # These fields are the same for every statistic of this country
        json_data["countryCode"] = country_code
        if atmire_cua:
            json_data["geoIpCountryCode"] = [country_code]
        json_data["continent"] = country_continent(country_code)

        # 640 statistics for each of the five countries
        for _ in range(640):
            timestamp = random_datetime()
            # Set a random time in our range
            json_data["time"] = timestamp.strftime("%Y-%m-%dT%H:%M:%SZ")
            if atmire_cua:
                json_data["dateYear"] = timestamp.strftime("%Y")
                json_data["dateYearMonth"] = timestamp.strftime("%Y-%m")

            # Set a random city from our list
            json_data["city"] = random_city(country_code)
            # Set a unique UUIDv4 (required in Solr stats schema)
            json_data["uid"] = str(uuid4())

            # The same dict is reused; it is written out before the next
            # iteration overwrites its per-statistic fields
            writer.write(json_data)

# Patch metadata carried on this line of the flattened diff, kept as a comment:
# diff --git a/ilri/generate_thumbnails.py b/ilri/generate_thumbnails.py new file mode 100755 index 000000000000..922d3f06967c --- /dev/null +++ b/ilri/generate_thumbnails.py @@ -0,0 +1,174 @@
#!/usr/bin/env python3
#
# generate-thumbnails.py 1.1.4
#
# Copyright Alan Orth.
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---

# Reads the filename and URL fields from a CSV, fetches the PDF, and generates
# a thumbnail using pyvips (libvips must be installed on the host).
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
#   $ pip install colorama requests pyvips
#
# See: https://requests.readthedocs.org/en/master

import argparse
import csv
import os.path
import re
import signal
import sys

import pyvips
import requests
from colorama import Fore


def signal_handler(signal, frame):
    """Terminate the script with exit status 1 (signal handler signature)."""
    sys.exit(1)


# Process thumbnails from filename.pdf to filename.webp using libvips. Equivalent
# to the following shell invocation:
#
#   vipsthumbnail 64661.pdf -s 600 -o '%s.webp[Q=89,strip]'
#
# vips is faster than GraphicsMagick/ImageMagick, uses less memory, and seems
# to generate better quality images.
# Note that libvips uses poppler instead of
# Ghostscript, which means that CMYK colorspace is not supported. We might need
# to do something about that...
#
# See: https://github.com/libvips/libvips/issues/379
def create_thumbnail(row):
    """Create a WebP thumbnail next to every downloaded file named in a CSV row.

    The filename column may hold several names separated by "||" (the same
    convention download_bitstream() uses); each one is handled independently.
    Files that are missing or that already have a thumbnail are skipped. With
    --dry-run nothing is written, only reported.
    """
    for filename in _split_values(row[args.filename_field_name]):
        thumbnail = os.path.splitext(filename)[0] + ".webp"

        # check if the file has been downloaded
        if not os.path.isfile(filename):
            if args.debug:
                print(f"{Fore.YELLOW}> Missing {filename}.{Fore.RESET}")
        # check if we already have a thumbnail
        elif os.path.isfile(thumbnail):
            if args.debug:
                print(
                    f"{Fore.YELLOW}> Thumbnail for {filename} already exists.{Fore.RESET}"
                )
        elif args.dry_run:
            print(f"{Fore.GREEN}> Would create thumbnail for {filename}.{Fore.RESET}")
        else:
            print(f"{Fore.GREEN}> Creating thumbnail for {filename}...{Fore.RESET}")
            vips_image = pyvips.Image.new_from_file(filename, access="sequential")
            # 600 is the thumbnail size passed to libvips, matching "-s 600"
            # in the vipsthumbnail invocation documented above
            vips_thumbnail = vips_image.thumbnail_image(600)
            vips_thumbnail.webpsave(thumbnail, Q=89, strip=True)

    return


def _split_values(value):
    """Split a CSV cell on "||", the separator used for multi-value fields."""
    return value.split("||")


def download_bitstream(row):
    """Download every URL of a CSV row to its corresponding file name.

    URLs and file names are paired positionally after splitting both columns
    on "||". Existing files are not downloaded again. With --dry-run nothing
    is fetched, only reported.
    """
    request_headers = {"user-agent": "CGSpace PDF bot"}

    # some records have multiple URLs separated by "||"
    urls = _split_values(row[args.url_field_name])
    filenames = _split_values(row[args.filename_field_name])

    # zip() silently drops unpaired values, so at least say so
    if len(urls) != len(filenames):
        print(
            Fore.YELLOW
            + f"> {len(urls)} URLs but {len(filenames)} filenames, ignoring the extra values."
            + Fore.RESET
        )

    for url, filename in zip(urls, filenames):
        if args.debug:
            print(f"URL: {url}")
            print(f"File: {filename}")

        # check if file exists
        if os.path.isfile(filename):
            if args.debug:
                print(Fore.YELLOW + f"> {filename} already downloaded." + Fore.RESET)
            continue

        if args.dry_run:
            print(Fore.GREEN + f"> Would download {filename}." + Fore.RESET)
            continue

        if args.debug:
            print(Fore.GREEN + f"> Downloading {filename}..." + Fore.RESET)

        # The context manager releases the connection even if writing fails
        with requests.get(url, headers=request_headers, stream=True) as response:
            if response.status_code != 200:
                print(
                    Fore.RED
                    + f"> Download failed (HTTP {response.status_code}), I will try again next time."
                    + Fore.RESET
                )
                continue

            # Write to a temporary name first: an interrupted transfer must not
            # leave a truncated file that the isfile() check above would treat
            # as "already downloaded" on the next run.
            partial_filename = filename + ".part"
            with open(partial_filename, "wb") as fd:
                for chunk in response.iter_content(chunk_size=65536):
                    fd.write(chunk)

        os.replace(partial_filename, filename)

    return


if __name__ == "__main__":
    # set the signal handler for SIGINT (^C)
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(
        description="Download PDFs and generate thumbnails from files in a CSV."
    )
    parser.add_argument(
        "-i",
        "--csv-file",
        help="Path to CSV file",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages to standard error (stderr).",
        action="store_true",
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        help="Only print changes that would be made.",
        action="store_true",
    )
    parser.add_argument(
        "-f",
        "--filename-field-name",
        help="Name of column with thumbnail filenames.",
        default="filename",
    )
    parser.add_argument(
        "-u",
        "--url-field-name",
        help="Name of column with URLs for the PDFs.",
        default="dc.description.url",
    )
    parser.add_argument(
        "-w", "--download-only", help="Only download the PDFs.", action="store_true"
    )
    args = parser.parse_args()

    # open the CSV
    reader = csv.DictReader(args.csv_file)

    # fieldnames is None for an empty CSV; treat that as "no columns"
    fieldnames = reader.fieldnames or []

    # check if the filename and URL fields specified by the user exist in the CSV
    if args.filename_field_name not in fieldnames:
        sys.stderr.write(
            f"{Fore.RED}Specified field '{args.filename_field_name}' does not exist in the CSV.\n{Fore.RESET}"
        )
        sys.exit(1)
    if args.url_field_name not in fieldnames:
        sys.stderr.write(
            f"{Fore.RED}Specified field '{args.url_field_name}' does not exist in the CSV.\n{Fore.RESET}"
        )
        sys.exit(1)

    # Only rows that have both a URL and a filename can be processed
    rows_to_process = [
        row
        for row in reader
        if row[args.url_field_name] and row[args.filename_field_name]
    ]

    for row in rows_to_process:
        download_bitstream(row)

        if not args.download_only:
            create_thumbnail(row)

# Patch metadata carried on this line of the flattened diff, kept as a comment:
# diff --git a/ilri/get_pdfs_dspace.py b/ilri/get_pdfs_dspace.py new file mode 100755 index 000000000000..40c5727544df --- /dev/null +++ b/ilri/get_pdfs_dspace.py @@
-0,0 +1,146 @@ +#!/usr/bin/env python3 +# +# get_pdfs_dspace.py 0.0.2 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries a DSpace 6 REST API for bitstreams from a list of handles and then +# downloads them if they are PDFs. Input file is hardcoded at /tmp/handles.txt +# and should have one handle per line, for example: +# +# 10568/93010 +# 10568/75869 +# +# The original use for this was to download a list of PDFs corresponding with +# a certain search result. I generated the list of handles by extracting them +# from the results of an OpenSearch query where the user had asked for all the +# items matching the term "trade off" in the WLE community: +# +# $ http 'https://cgspace.cgiar.org/open-search/discover?scope=10568%2F34494&query=trade+off&rpp=100&start=0' User-Agent:'curl' > /tmp/wle-trade-off-page1.xml +# $ xmllint --xpath '//*[local-name()="entry"]/*[local-name()="id"]/text()' /tmp/wle-trade-off-page1.xml >> /tmp/ids.txt +# # ... and on and on for each page of results... 
#   $ sort -u /tmp/ids.txt > /tmp/ids-sorted.txt
#   $ grep -oE '[0-9]+/[0-9]+' /tmp/ids-sorted.txt > /tmp/handles.txt
#
# This script is written for Python 3.7+ and requires several modules that you
# can install with pip (I recommend using a Python virtual environment):
#
#   $ pip install colorama requests requests-cache
#

import logging
import os.path
import re
from datetime import timedelta

import requests
import requests_cache
from colorama import Fore

# Create a local logger instance
logger = logging.getLogger(__name__)

# Captures the file name from a Content-Disposition header value
_FILENAME_PATTERN = re.compile("filename=(.+)")


def resolve_bitstreams(handle):
    """Look up an item by handle and download all of its PDF bitstreams.

    The item is requested from the REST API with its bitstreams expanded;
    bitstreams whose format is "Adobe PDF" are passed on to
    download_bitstreams(). A non-200 response is logged and skipped.
    """
    url = f"{rest_base_url}/{rest_handle_endpoint}/{handle}"
    request_params = {"expand": "bitstreams"}
    request_headers = {"user-agent": rest_user_agent, "Accept": "application/json"}
    response = requests.get(url, params=request_params, headers=request_headers)

    if response.status_code != 200:
        logger.warning(
            Fore.YELLOW
            + f"> Could not resolve {handle} (HTTP {response.status_code})."
            + Fore.RESET
        )

        return

    # `or []` covers both a missing key and an explicit null
    bitstreams = response.json().get("bitstreams") or []
    pdf_bitstream_ids = [
        bitstream["uuid"]
        for bitstream in bitstreams
        if bitstream.get("format") == "Adobe PDF"
    ]

    if pdf_bitstream_ids:
        download_bitstreams(pdf_bitstream_ids)

    return


def download_bitstreams(pdf_bitstream_ids):
    """Download each bitstream into the current directory.

    The file name comes from the Content-Disposition header of a HEAD
    request. Bitstreams without a usable file name are skipped with a
    warning, and files that already exist are not downloaded again.
    """
    for pdf_bitstream_id in pdf_bitstream_ids:
        url = f"{rest_base_url}/{rest_bitstream_endpoint}/{pdf_bitstream_id}/retrieve"
        request_headers = {
            "user-agent": rest_user_agent,
        }

        # do a HEAD request first to get the filename from the content disposition header
        # See: https://stackoverflow.com/questions/31804799/how-to-get-pdf-filename-with-python-requests
        response = requests.head(url, headers=request_headers)

        if response.status_code != 200:
            continue

        content_disposition = response.headers.get("content-disposition", "")
        match = _FILENAME_PATTERN.search(content_disposition)
        if match is None:
            logger.warning(
                Fore.YELLOW
                + f"> No filename for bitstream {pdf_bitstream_id}, skipping."
                + Fore.RESET
            )
            continue

        # filenames in the header have quotes so strip them. The name is
        # server-supplied, so also drop any directory part to make sure we
        # only ever write inside the current directory.
        filename_stripped = os.path.basename(match.group(1).strip('"'))
        if not filename_stripped:
            continue
        logger.debug(f"> filename: {filename_stripped}")

        # check if file exists
        if os.path.isfile(filename_stripped):
            logger.debug(
                Fore.YELLOW
                + "> {} already downloaded.".format(filename_stripped)
                + Fore.RESET
            )
            continue

        logger.info(
            Fore.GREEN + "> Downloading {}...".format(filename_stripped) + Fore.RESET
        )

        with requests.get(
            url, headers={"user-agent": rest_user_agent}, stream=True
        ) as response:
            if response.status_code != 200:
                logger.error(
                    Fore.RED
                    + "> Download failed, I will try again next time."
                    + Fore.RESET
                )
                continue

            # Write to a temporary name first so an interrupted transfer is
            # not mistaken for a finished download on the next run
            partial_filename = filename_stripped + ".part"
            with open(partial_filename, "wb") as fd:
                for chunk in response.iter_content(chunk_size=65536):
                    fd.write(chunk)

        os.replace(partial_filename, filename_stripped)

    return


rest_base_url = "https://cgspace.cgiar.org/rest"
rest_handle_endpoint = "handle"
rest_bitstream_endpoint = "bitstreams"
rest_user_agent = "get_pdfs_dspace.py/0.0.2 (python / curl)"

# Set local logging level to INFO
logger.setLevel(logging.INFO)
# Set the global log format to display just the message without the log level
logging.basicConfig(format="%(message)s")

# One handle per line; surrounding whitespace (including the line feed) is
# stripped and blank lines are ignored
with open("/tmp/handles.txt", "r") as fd:
    handles = [line.strip() for line in fd if line.strip()]

# Set up a transparent requests cache to be nice to the REST API
expire_after = timedelta(days=30)
requests_cache.install_cache("requests-cache", expire_after=expire_after)

# prune old cache entries (delete() without a condition removes nothing)
requests_cache.delete(expired=True)

for handle in handles:
    logger.info(f"Checking for PDF bitstreams in {handle}")

    resolve_bitstreams(handle)

# Patch metadata carried on this line of the flattened diff, kept as a comment:
# diff --git a/ilri/get_pdfs_scihub.py b/ilri/get_pdfs_scihub.py new file mode 100755 index 000000000000..7e56d572da20 --- /dev/null +++ b/ilri/get_pdfs_scihub.py @@ -0,0 +1,117 @@
#!/usr/bin/env python3
#
# get_pdfs_scihub.py 0.0.3
#
# Copyright Alan Orth.
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# Attempts to download PDFs for given DOIs from Sci-Hub. We only do this for
# items we know are licensed Creative Commons (though not "ND").
# The idea is
# to download the PDFs in order to create and upload thumbnails to CGSpace,
# not to upload the PDFs themselves (yet?).
#
# Input file should have one DOI per line, for example:
#
#   https://doi.org/10.5194/bg-18-1481-2021
#   https://doi.org/10.5194/gmd-14-3789-2021
#
# This script is written for Python 3.7+ and requires several modules that you
# can install with pip (I recommend using a Python virtual environment):
#
#   $ pip install colorama scidownl
#

import argparse
import csv
import logging
import os.path
import signal
import sys

import util
from colorama import Fore
from scidownl import scihub_download

# Create a local logger instance
logger = logging.getLogger(__name__)


def signal_handler(signal, frame):
    """Terminate the script with exit status 1 (signal handler signature)."""
    sys.exit(1)


def download_pdf(doi):
    """Try to fetch the PDF for a DOI from Sci-Hub into the output directory.

    The target file is named after the DOI with "/" replaced by "-" plus
    ".pdf". A DOI whose file already exists is skipped.
    """
    logger.info(f"Processing {doi}")

    filename = doi.replace("/", "-") + ".pdf"
    filename = os.path.join(args.output_directory, filename)

    # check if file exists already
    if os.path.isfile(filename):
        logger.debug(Fore.GREEN + f"> {filename} already downloaded." + Fore.RESET)

        return

    logger.debug(Fore.GREEN + f"> Attempting to download PDF for {doi}" + Fore.RESET)

    scihub_download(doi, paper_type="doi", out=filename)

    # check if the file was downloaded, since we have no way to know if it was
    # successful.
    if os.path.isfile(filename):
        logger.info(Fore.YELLOW + f"> Successfully saved to: {filename}" + Fore.RESET)
    else:
        # A failure must be visible without --debug (and with --quiet)
        logger.warning(Fore.RED + "> Download unsuccessful." + Fore.RESET)


if __name__ == "__main__":
    # set the signal handler for SIGINT (^C)
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(description="Download PDFs from Sci-Hub.")
    parser.add_argument(
        "-i",
        "--input-file",
        help="Path to input file.",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages to standard error (stderr).",
        action="store_true",
    )
    parser.add_argument(
        "-o",
        "--output-directory",
        help="Name of directory to save files.",
        required=False,
        default=".",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        help="Do not print progress messages to the screen.",
        action="store_true",
    )
    args = parser.parse_args()

    # The default log level is WARNING, but we want to set it to DEBUG or INFO.
    # --quiet keeps the default so that only warnings and errors are shown;
    # --debug wins if both are given.
    if args.debug:
        logger.setLevel(logging.DEBUG)
    elif args.quiet:
        logger.setLevel(logging.WARNING)
    else:
        logger.setLevel(logging.INFO)

    # Set the global log format
    logging.basicConfig(format="[%(levelname)s] %(message)s")

    # Make sure the directory we are asked to save into exists
    os.makedirs(args.output_directory, exist_ok=True)

    dois = util.read_dois_from_file(args.input_file)

    for doi in dois:
        download_pdf(doi)

# Patch metadata carried on this line of the flattened diff, kept as a comment:
# diff --git a/ilri/get_pdfs_unpaywall.py b/ilri/get_pdfs_unpaywall.py new file mode 100755 index 000000000000..b4ce702b68ba --- /dev/null +++ b/ilri/get_pdfs_unpaywall.py @@ -0,0 +1,163 @@
#!/usr/bin/env python3
#
# get_pdfs_unpaywall.py 0.0.1
#
# Copyright Alan Orth.
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# Queries the public Unpaywall API for DOIs read from a text file, one per line,
# and attempts to download fulltext PDFs.
#
import argparse
import logging
import os
import re
import signal
import sys
from datetime import timedelta

import requests
import requests_cache
import util
from colorama import Fore

# Create a root logger instance so that submodules can inherit our config.
# See: https://gist.github.com/gene1wood/73b715434c587d2240c21fc83fad7962#explanation-of-the-relationship-between-python-logging-root-logger-and-other-loggers
logger = logging.getLogger()


def resolve_doi(doi: str) -> None:
    """Look up a DOI in Unpaywall and download the first PDF that works.

    The PDF is saved in the output directory under a name derived from the
    DOI. Nothing is done if that file already exists or if Unpaywall does not
    answer successfully for the DOI. The open-access locations are tried in
    the order Unpaywall returns them until one download succeeds.

    Exits the script with status 1 on a connection error to the Unpaywall API.
    """
    logger.info(f"Looking up DOI: {doi}")

    # Set filename based on DOI so we can check whether it has already been
    # downloaded, ie: 10.3402/iee.v6.31191 → 10.3402-iee.v6.31191.pdf
    pdf_filename = doi.replace("/", "-") + ".pdf"
    pdf_file_path = os.path.join(args.output_directory, pdf_filename)

    # Check if file exists already so we can return early if so
    if os.path.isfile(pdf_file_path):
        logger.debug(Fore.GREEN + f"> {pdf_file_path} already downloaded." + Fore.RESET)

        return

    # Fetch the metadata for this DOI
    request_url = f"https://api.unpaywall.org/v2/{doi}"
    request_params = {"email": args.email}

    try:
        request = requests.get(request_url, params=request_params)
    except requests.exceptions.ConnectionError:
        logger.error(Fore.RED + "Connection error." + Fore.RESET)

        # I guess we have to exit
        sys.exit(1)

    # Fail early if the DOI is not found in Unpaywall
    if not request.ok:
        logger.debug(f"> DOI not in Unpaywall (cached: {request.from_cache})")

        return

    logger.debug(f"> DOI in Unpaywall (cached: {request.from_cache})")

    data = request.json()

    # `or []` covers both a missing key and an explicit null
    for oa_location in data.get("oa_locations") or []:
        # Make sure there is actually something here, sometimes the value is
        # missing or blank! Bail out early to check the next source
        url_for_pdf = oa_location.get("url_for_pdf")
        if not url_for_pdf:
            continue

        logger.info(
            Fore.YELLOW + f"> Attempting to download: {url_for_pdf}" + Fore.RESET
        )

        # Try to download the file from this OA location. A failing location
        # must not stop us from trying the next one, but only ordinary errors
        # are caught here: a bare except would also swallow the SystemExit
        # raised by our SIGINT handler and make ^C ineffective.
        try:
            downloaded = util.download_file(url_for_pdf, pdf_file_path)
        except Exception as error:
            logger.debug(Fore.RED + f"> Download failed: {error}" + Fore.RESET)
            continue

        if downloaded:
            logger.info(
                Fore.YELLOW + f"> Successfully saved to: {pdf_file_path}" + Fore.RESET
            )

            # We have the PDF, no need to look at the remaining locations
            return

        # I guess this OA location is stale
        logger.debug(Fore.RED + "> Download unsuccessful." + Fore.RESET)


def signal_handler(signal, frame):
    """Terminate the script with exit status 1 (signal handler signature)."""
    sys.exit(1)


parser = argparse.ArgumentParser(
    description="Query the Unpaywall REST API for metadata about DOIs."
)
parser.add_argument(
    "-e",
    "--email",
    required=True,
    help="Contact email to use in API requests so Unpaywall is more lenient with our request rate.",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="File name containing DOIs to look up.",
    required=True,
    type=argparse.FileType("r"),
)
parser.add_argument(
    "-o",
    "--output-directory",
    help="Name of directory to save files.",
    required=False,
    default=".",
)
args = parser.parse_args()

# Since we are running interactively we can override the log level and format.
# The default log level is WARNING, but we want to set it to DEBUG or INFO.
if args.debug:
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format="[D] %(message)s")
else:
    logger.setLevel(logging.INFO)
    logging.basicConfig(format="[I] %(message)s")

# Install a transparent request cache. HTTP 404 responses are cached as well
# as successful ones.
expire_after = timedelta(days=30)
requests_cache.install_cache(
    "requests-cache", expire_after=expire_after, allowable_codes=(200, 404)
)
# Prune expired cache entries (delete() without a condition removes nothing)
requests_cache.delete(expired=True)

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# if the user specified an input file, get the DOIs from there
if args.input_file:
    dois = util.read_dois_from_file(args.input_file)
    for doi in dois:
        resolve_doi(doi)

# Patch metadata carried on this line of the flattened diff, kept as a comment:
# diff --git a/ilri/iso3166_lookup.py b/ilri/iso3166_lookup.py new file mode 100755 index 000000000000..356edfebc157 --- /dev/null +++ b/ilri/iso3166_lookup.py @@ -0,0 +1,174 @@
#!/usr/bin/env python3
#
# iso3166-lookup.py 0.0.1
#
# Copyright Alan Orth.
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# Queries the ISO 3166 dataset for countries read from a text file. Text file
# should have one country per line. Results are saved to a CSV including
# the country name, whether it matched or not, and the type of match.
#
# This script is written for Python 3.6+ and requires several modules that you
# can install with pip (I recommend using a Python virtual environment):
#
#   $ pip install colorama pycountry
#

import argparse
import csv
import signal
import sys

import pycountry
from colorama import Fore


# read countries from a text file, one per line
def read_countries_from_file():
    """Read the unique country names from the input file and resolve them.

    Leading and trailing whitespace is trimmed, blank lines are ignored, and
    duplicates are dropped while keeping the order of first appearance. The
    input file is closed before the names are passed to resolve_countries().
    """
    stripped_lines = (line.strip() for line in args.input_file)
    # dict keys keep insertion order, which gives an order-preserving
    # de-duplication without rescanning a list for every line
    countries = list(dict.fromkeys(name for name in stripped_lines if name))

    # close input file before we exit
    args.input_file.close()

    resolve_countries(countries)


def resolve_countries(countries):
    """Write one CSV row per country saying whether and how it matched.

    Each name is compared case-insensitively against the ISO 3166 names, then
    the official names, then the common names; the first list that contains
    it decides the "match type". The output file is closed when done, also if
    an error interrupts the run.
    """
    fieldnames = ["country", "match type", "matched"]
    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)

    # (value for the "match type" column, names to search, message prefix),
    # in order of precedence
    match_sources = (
        ("name", country_names, "Name match"),
        ("official_name", country_official_names, "Official name match"),
        ("common_name", country_common_names, "Common name match"),
    )

    try:
        writer.writeheader()

        for country in countries:
            if args.debug:
                sys.stderr.write(
                    Fore.GREEN + f"Looking up the country: {country!r}\n" + Fore.RESET
                )

            # the reference lists hold lowercased names
            needle = country.lower()

            # check for exact match
            for match_type, names, label in match_sources:
                if needle in names:
                    print(f"{label} for {country!r}")

                    writer.writerow(
                        {
                            "country": country,
                            "match type": match_type,
                            "matched": "true",
                        }
                    )
                    break
            else:
                if args.debug:
                    sys.stderr.write(
                        Fore.YELLOW + f"No match for {country!r}\n" + Fore.RESET
                    )

                writer.writerow(
                    {
                        "country": country,
                        "match type": "",
                        "matched": "false",
                    }
                )
    finally:
        # close output file before we exit
        args.output_file.close()


def signal_handler(signal, frame):
    """Close the output file and exit with status 1 (signal handler signature)."""
    # close output file before we exit
    args.output_file.close()

    sys.exit(1)


parser = argparse.ArgumentParser(
    description="Query ISO 3166-1 to validate countries from a text file and save results in a CSV."
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="File name containing countries to look up in ISO 3166-1 and ISO 3166-3.",
    required=True,
    type=argparse.FileType("r"),
)
parser.add_argument(
    "-o",
    "--output-file",
    help="Name of output file to write results to (CSV).",
    required=True,
    type=argparse.FileType("w", encoding="UTF-8"),
)
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# create empty lists to hold country names
country_names = []
country_official_names = []
country_common_names = []

# iterate over countries and append names to the appropriate lists. We can't use
# a list comprehension here because some countries don't have official_name, etc
# and they raise an AttributeError. Anyways, it's more efficient to iterate over
# the list of countries just once.
+for country in pycountry.countries: + country_names.append(country.name.lower()) + + try: + country_official_names.append(country.official_name.lower()) + except AttributeError: + pass + + try: + country_common_names.append(country.common_name.lower()) + except AttributeError: + pass + +# Add names for historic countries from ISO 3166-3 +for country in pycountry.historic_countries: + country_names.append(country.name.lower()) + + try: + country_official_names.append(country.official_name.lower()) + except AttributeError: + pass + + try: + country_common_names.append(country.common_name.lower()) + except AttributeError: + pass + +read_countries_from_file() + +exit() diff --git a/ilri/iso_639_value_pairs.py b/ilri/iso_639_value_pairs.py new file mode 100755 index 000000000000..78e488ee6dc8 --- /dev/null +++ b/ilri/iso_639_value_pairs.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# +# Ghetto script to export value pairs for ISO 639-1 Alpha 2 codes from pycountry + +import pycountry + +for language in pycountry.languages: + try: + language.alpha_2 + except: + continue + + print(" ") + print(f" {language.name}") + print(f" {language.alpha_2}") + print(" ") + +print(" ") +print(" N/A") +print(" ") +print(" ") +print(" ") +print(" (Other)") +print(" other") +print(" ") diff --git a/ilri/migrate-fields.sh b/ilri/migrate-fields.sh new file mode 100755 index 000000000000..4b89e10563d3 --- /dev/null +++ b/ilri/migrate-fields.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# +# Moves DSpace metadatavalues from one field to another. Assumed to be running +# as the `postgres` Linux user. You MUST perform a full Discovery reindex after +# doing this, ie: index-discovery -bf +# +# Alan Orth, April, 2016 + +# Exit on first error +set -o errexit + +# Names of fields to move, in this format: +# +# old_field new_field +# +# fields are separated with tabs or spaces. Uses bash's `mapfile` to read into +# an array. 
+mapfile -t fields_to_move < 0: + if args.dry_run: + if not args.quiet: + print( + f"{Fore.GREEN}Would move {cursor.rowcount} occurences of: {line}{Fore.RESET}" + ) + + # Since this a dry run we can continue to the next line + continue + + # Get the records for items with matching metadata. We will use the + # object IDs to update their last_modified dates. + matching_records = cursor.fetchall() + + sql = "UPDATE metadatavalue SET metadata_field_id=%s WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value=%s" + cursor.execute( + sql, + ( + to_field_id, + from_field_id, + line, + ), + ) + + if cursor.rowcount > 0: + if not args.quiet: + print( + f"{Fore.GREEN}Moved {cursor.rowcount} occurences of: {line}{Fore.RESET}" + ) + + # Update the last_modified date for each item we've changed + for record in matching_records: + util.update_item_last_modified(cursor, record[0]) + +# close database connection before we exit +conn.close() + +# close input file +args.input_file.close() + +sys.exit(0) diff --git a/ilri/orcid_authority_to_item.py b/ilri/orcid_authority_to_item.py new file mode 100755 index 000000000000..a2901beb223b --- /dev/null +++ b/ilri/orcid_authority_to_item.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +# +# orcid-authority-to-item.py 1.1.1 +# +# Copyright Alan Orth. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
#
# ---
#
# Map ORCID identifiers from DSpace's Solr authority core by creating new cg.creator.id
# fields in each matching item.
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
#   $ pip install colorama psycopg2-binary requests requests-cache
#

import argparse
import signal
import sys
from datetime import timedelta

import psycopg2
import requests
import requests_cache
from colorama import Fore

# Number of Solr documents requested per page
_SOLR_ROWS_PER_PAGE = 2000


def _debug(args, message, color=Fore.GREEN):
    """Write a colored message to stderr, but only when --debug is enabled."""
    if args.debug:
        sys.stderr.write(color + message + Fore.RESET)


def main():
    """Parse the command line, install the SIGINT handler and run the mapping."""
    # parse the command line arguments
    parser = argparse.ArgumentParser(
        description="Map ORCID identifiers from the DSpace Solr authority core to cg.creator.id fields in each item."
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages to standard error (stderr).",
        action="store_true",
    )
    parser.add_argument("-db", "--database-name", help="Database name", required=True)
    parser.add_argument(
        "-u", "--database-user", help="Database username", required=True
    )
    parser.add_argument(
        "-p", "--database-pass", help="Database password", required=True
    )
    parser.add_argument(
        "-s",
        "--solr-url",
        help="URL of Solr application",
        default="http://localhost:8080/solr",
    )
    args = parser.parse_args()

    # set the signal handler for SIGINT (^C) so we can exit cleanly
    signal.signal(signal.SIGINT, signal_handler)

    # get all ORCID identifiers from Solr authority core
    read_identifiers_from_solr(args)


# query DSpace's authority Solr core for authority IDs with ORCID identifiers
def read_identifiers_from_solr(args):
    """Collect {authority id: ORCID iD} from Solr and hand it to the updater.

    The 'authority' core is read page by page until every matching record
    has been seen, so results beyond the first page are not silently lost.
    """
    solr_url = args.solr_url + "/authority/select"

    # initialize an empty dictionary for authorities
    # format will be: {'d7ef744b-bbd4-4171-b449-00e37e1b776f': '0000-0002-3476-272X', ...}
    authorities = {}

    start = 0
    while True:
        solr_query_params = {
            "q": "orcid_id:*",
            "wt": "json",
            "rows": _SOLR_ROWS_PER_PAGE,
            "start": start,
        }
        res = requests.get(solr_url, params=solr_query_params)
        res.raise_for_status()
        response = res.json()["response"]

        if start == 0:
            _debug(
                args,
                "Total number of Solr records with ORCID iDs: {0}\n".format(
                    response["numFound"]
                ),
            )

        docs = response["docs"]
        # only keep the first ORCID iD seen for each authority ID
        for doc in docs:
            authorities.setdefault(doc["id"], doc["orcid_id"])

        start += _SOLR_ROWS_PER_PAGE
        # an empty page also ends the loop so we can never spin forever
        if not docs or start >= response["numFound"]:
            break

    add_orcid_identifiers(args, authorities)


# Query ORCID's public API for names associated with an identifier. Prefers to use
# the "credit-name" field if it is present, otherwise will default to using the
# "given-names" and "family-name" fields.
def resolve_orcid_identifier(args, orcid):
    """Return the display name for an ORCID iD, or None if there is none.

    None is returned when the API answers HTTP 404 or when the record has
    neither a credit name nor given names. Any other unsuccessful response
    terminates the script with exit status 1.
    """
    # ORCID API endpoint, see: https://pub.orcid.org
    orcid_api_base_url = "https://pub.orcid.org/v2.1/"
    orcid_api_endpoint = "/person"

    # fetch names associated with an ORCID identifier from the ORCID API
    _debug(
        args,
        "Looking up the names associated with ORCID iD: {0}\n".format(orcid),
    )

    # enable the transparent request cache with thirty-day expiry the first
    # time we get here instead of re-installing it for every identifier
    if not requests_cache.is_installed():
        # cache HTTP 200 and 404 responses, because ORCID uses HTTP 404 when an identifier doesn't exist
        requests_cache.install_cache(
            "requests-cache",
            expire_after=timedelta(days=30),
            allowable_codes=(200, 404),
        )
        # prune old cache entries (delete() without a condition removes nothing)
        requests_cache.delete(expired=True)

    # build request URL for current ORCID ID
    request_url = orcid_api_base_url + orcid.strip() + orcid_api_endpoint

    # ORCID's API defaults to some custom format, so tell it to give us JSON
    request = requests.get(request_url, headers={"Accept": "application/json"})

    # HTTP 404 means that the API url or identifier was not found. If the
    # API URL is correct, let's assume that the identifier was not found.
    if request.status_code == 404:
        _debug(
            args,
            "Warning: skipping missing identifier (API request returned HTTP 404).\n\n",
            Fore.YELLOW,
        )
        return None

    if request.status_code != requests.codes.ok:
        sys.stderr.write(Fore.RED + "Error: request failed.\n" + Fore.RESET)
        sys.exit(1)

    # make sure name element is not null
    name = request.json()["name"]
    if not name:
        _debug(
            args,
            "Warning: skipping identifier with null name element.\n\n",
            Fore.YELLOW,
        )
        return None

    # prefer credit-name if present and not blank
    credit_name = name.get("credit-name")
    if credit_name and credit_name["value"] != "":
        return credit_name["value"]

    # otherwise use given-names + family-name
    # make sure given-names is not null
    given_names = name.get("given-names")
    if not given_names:
        _debug(
            args,
            "Warning: skipping identifier with null name element.\n\n",
            Fore.YELLOW,
        )
        return None

    line = given_names["value"]
    # make sure family-name is not null
    family_name = name.get("family-name")
    if family_name:
        line = line + " " + family_name["value"]
    else:
        _debug(args, "Warning: ignoring null family-name element.\n", Fore.YELLOW)

    return line


def _get_creator_id_field(cursor):
    """Return the metadata_field_id of cg.creator.id, or None if unregistered."""
    cursor.execute(
        "SELECT metadata_field_id FROM metadatafieldregistry WHERE metadata_schema_id=2 AND element='creator' AND qualifier='id'"
    )
    row = cursor.fetchone()

    return row[0] if row else None


def add_orcid_identifiers(args, authorities):
    """Add a "Name: ORCID" cg.creator.id value to items using each authority.

    For every authority ID the name is resolved through the ORCID API;
    authorities without a resolvable name are skipped. A value is only
    inserted when the item does not already have an identical one, so the
    script can be re-run safely. Each authority is handled in its own
    transaction.
    """
    # connect to database. Keyword arguments are used instead of a hand-built
    # connection string so that passwords containing spaces or quotes work.
    try:
        conn = psycopg2.connect(
            dbname=args.database_name,
            user=args.database_user,
            password=args.database_pass,
            host="localhost",
        )
    except psycopg2.OperationalError:
        sys.stderr.write(Fore.RED + "Unable to connect to the database.\n" + Fore.RESET)
        sys.exit(1)

    _debug(args, "Connected to the database.\n")

    try:
        # get the metadata_field_id for cg.creator.id field once, it is the
        # same for every record
        with conn:
            with conn.cursor() as cursor:
                metadata_field_id = _get_creator_id_field(cursor)

        if metadata_field_id is None:
            sys.stderr.write(
                Fore.RED
                + "Error: cg.creator.id is not in the metadata registry.\n"
                + Fore.RESET
            )
            sys.exit(1)

        # new values get this confidence
        confidence = -1

        # iterate over all authorities
        for authority_id, orcid in authorities.items():
            # get name associated with this orcid identifier
            name = resolve_orcid_identifier(args, orcid)
            if name is None:
                # nothing sensible to write for this identifier
                continue

            # author name and orcid identifier
            creator = "{0}: {1}".format(name, orcid)

            _debug(
                args,
                "Processing authority ID {0} with ORCID iD: {1}\n".format(
                    authority_id, orcid
                ),
            )

            with conn:
                # cursor will be closed after this block exits
                # see: http://initd.org/psycopg/docs/usage.html#with-statement
                with conn.cursor() as cursor:
                    # find all metadata records with this authority id
                    # resource_type_id 2 is item metadata, metadata_field_id 3 is author
                    sql = "SELECT resource_id, place FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=3 AND authority=%s"
                    # remember that tuples with one item need a comma after them!
                    cursor.execute(sql, (authority_id,))
                    records_with_authority = cursor.fetchall()

                    _debug(
                        args,
                        "Checking {0} items for authority ID {1}.\n".format(
                            len(records_with_authority), authority_id
                        ),
                    )

                    # iterate over results for current authority_id to add cg.creator.id metadata
                    for resource_id, place in records_with_authority:
                        # first, check if there is an existing cg.creator.id here (perhaps the script crashed before?)
                        # resource_type_id 2 is item metadata
                        sql = "SELECT * from metadatavalue WHERE resource_id=%s AND metadata_field_id=%s AND text_value=%s AND place=%s AND confidence=%s AND resource_type_id=2"
                        cursor.execute(
                            sql,
                            (
                                resource_id,
                                metadata_field_id,
                                creator,
                                place,
                                confidence,
                            ),
                        )

                        if cursor.fetchall():
                            _debug(
                                args,
                                "Item {0} already has an ORCID identifier for {1}.\n".format(
                                    resource_id, creator
                                ),
                            )
                            continue

                        print(
                            "Adding ORCID identifier to item {0}: {1}".format(
                                resource_id, creator
                            )
                        )

                        # metadatavalue IDs come from a PostgreSQL sequence that increments when you call it
                        cursor.execute("SELECT nextval('metadatavalue_seq')")
                        metadata_value_id = cursor.fetchone()[0]

                        sql = "INSERT INTO metadatavalue (metadata_value_id, resource_id, metadata_field_id, text_value, place, confidence, resource_type_id) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                        cursor.execute(
                            sql,
                            (
                                metadata_value_id,
                                resource_id,
                                metadata_field_id,
                                creator,
                                place,
                                confidence,
                                2,
                            ),
                        )
    finally:
        _debug(args, "Disconnecting from database.\n")

        # close the database connection before leaving
        conn.close()


def signal_handler(signal, frame):
    """Terminate the script with exit status 1 (signal handler signature)."""
    sys.exit(1)


if __name__ == "__main__":
    main()

# Patch metadata carried on this line of the flattened diff, kept as a comment:
# diff --git a/ilri/parse_iso_codes.py b/ilri/parse_iso_codes.py new file mode 100755 index 000000000000..d8000aad0770 --- /dev/null +++ b/ilri/parse_iso_codes.py @@ -0,0 +1,98 @@
#!/usr/bin/env python3
#
# parse-iso-codes.py v0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# Parses iso_3166-1.json from Debian's iso-codes package to a list of
# countries. (The description previously found here was about looking up
# organizations in the Research Organization Registry and did not match the
# script's own --help text.)
#
# This script is written for Python 3.6+.
def choose_country_name(country: dict):
    """Return the most convenient display name for an iso-codes country entry.

    The common name wins whenever the key exists. Otherwise the shorter of
    ``name`` and ``official_name`` is returned; when both have the same length
    the official name is used.

    :param country: one entry of the "3166-1" list from iso_3166-1.json.
    :returns: the chosen country name.
    :raises KeyError: if the entry has no usable name at all.
    """

    # Prefer the common name if it exists!
    if "common_name" in country:
        return country["common_name"]

    name = country.get("name")
    official_name = country.get("official_name")

    if name and not official_name:
        return name

    if official_name and not name:
        return official_name

    if not name and not official_name:
        # Fail with a message that names the record instead of an incidental
        # KeyError raised from inside the length comparison below.
        raise KeyError(f"country entry has no usable name: {country!r}")

    # Both names exist: prefer the shorter one.
    if len(name) < len(official_name):
        return name

    return official_name
# Command line interface: read iso_3166-1.json and write one country name per
# line to the output file.
parser = argparse.ArgumentParser(
    description="Parse iso_3166-1.json from Debian's iso-codes package to a list of countries."
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to iso_3166-1.json from Debian iso-codes package.",
    required=True,
    # Decode explicitly as UTF-8 (as the output file already does) so that
    # non-ASCII country names do not depend on the current locale.
    type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument(
    "-o",
    "--output-file",
    help="Name of output file to write results to.",
    required=True,
    type=argparse.FileType("w", encoding="UTF-8"),
)
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

try:
    # read the list of countries
    countries_json = json.load(args.input_file)

    for country in countries_json["3166-1"]:
        country_name = choose_country_name(country)

        args.output_file.write(f"{country_name}\n")
finally:
    # close both files even if the JSON is malformed or a record is unusable
    args.input_file.close()
    args.output_file.close()

sys.exit(0)
def login(user: str, password: str):
    """Log into the DSpace REST API.

    Equivalent to the following request with httpie or curl:

        $ http -f POST http://localhost:8080/rest/login email=aorth@fuuu.com password='fuuuuuu'

    Exits the program if the API is unreachable, the login is rejected, or the
    response carries no JSESSIONID cookie.

    :param user: email of user with permissions to update the item (should probably be an admin).
    :param password: password of user.
    :returns: JSESSION value for the session.
    """

    request_url = rest_login_endpoint
    headers = {"user-agent": user_agent}
    # Use the credentials we were called with rather than reaching for the
    # global command line arguments.
    data = {"email": user, "password": password}

    logger.info("Logging in...")

    try:
        request = requests.post(request_url, headers=headers, data=data)
    except requests.ConnectionError:
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code != requests.codes.ok:
        logger.error(Fore.RED + "> Login failed." + Fore.RESET)

        sys.exit(1)

    try:
        jsessionid = request.cookies["JSESSIONID"]
    except KeyError:
        logger.error(
            Fore.RED
            + f"Login failed (HTTP {request.status_code}): missing JESSSIONID cookie in response...?"
            + Fore.RESET
        )

        sys.exit(1)

    logger.debug(
        Fore.GREEN + f"Login successful, new JSESSIONID: {jsessionid}" + Fore.RESET
    )

    return jsessionid


def check_session(jsessionid: str):
    """Check the authentication status of the specified JSESSIONID.

    Exits the program if the API is unreachable.

    :param jsessionid: JSESSIONID value for a previously authenticated session.
    :returns: bool, True only if the status endpoint reports the session as
        authenticated.
    """

    request_url = rest_status_endpoint
    headers = {"user-agent": user_agent, "Accept": "application/json"}
    cookies = {"JSESSIONID": jsessionid}

    logger.debug(f"Checking status of existing session: {jsessionid}" + Fore.RESET)

    try:
        request = requests.get(request_url, headers=headers, cookies=cookies)
    except requests.ConnectionError:
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code != requests.codes.ok:
        logger.error(Fore.RED + "Error checking session status." + Fore.RESET)

        return False

    if not request.json()["authenticated"]:
        logger.warning(Fore.RED + f"Session expired: {jsessionid}" + Fore.RESET)

        return False

    logger.debug(Fore.GREEN + f"Session valid: {jsessionid}" + Fore.RESET)

    return True
def check_item(item_id: str, bundle: str):
    """Check if the item already has bitstreams.

    Equivalent to the following request with httpie or curl:

        $ http 'http://localhost:8080/rest/items/804351af-64eb-4e4a-968f-4d3be61358a8?expand=bitstreams,metadata' \
            Cookie:JSESSIONID=B3B9C82F257BCE1773E6FB1EA5ACD774

    By default this will return True if the item has any bitstreams in the named
    bundle and False if the bundle is empty. If the user has asked to overwrite
    bitstreams then the bitstreams with a matching format are deleted first and
    False is returned once that succeeded. True is also returned whenever it is
    not safe to upload: the item is missing, the request failed, or a deletion
    failed.

    :param item_id: uuid of item in the DSpace repository.
    :param bundle: name of the bundle to inspect, ie ORIGINAL or THUMBNAIL.
    :returns: bool
    """

    request_url = f"{rest_items_endpoint}/{item_id}"
    headers = {"user-agent": user_agent}
    # Not strictly needed here for permissions, but let's give the session ID
    # so that we don't allocate unecessary resources on the server.
    cookies = {"JSESSIONID": jsessionid}
    request_params = {"expand": "bitstreams,metadata"}

    try:
        request = requests.get(
            request_url, headers=headers, cookies=cookies, params=request_params
        )
    except requests.ConnectionError:
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)

    # If the item doesn't exist, return True early so we don't try to upload a
    # bitstream
    if request.status_code == 404:
        logger.warning(Fore.RED + "Item not found." + Fore.RESET)
        return True

    # For any other failure assume the item has a bitstream and return True so
    # we don't upload another.
    if request.status_code != requests.codes.ok:
        return True

    data = request.json()

    # Keep only the bitstreams that belong to the bundle we're interested in
    bitstreams_in_bundle = [
        bitstream
        for bitstream in data["bitstreams"]
        if bitstream["bundleName"] == bundle
    ]

    if not bitstreams_in_bundle:
        # The item does not have a bitstream in this bundle yet
        return False

    # We have bitstreams, so let's see if the user wants to overwrite them
    if not args.overwrite_format:
        logger.debug(f"> Skipping item with existing bitstream(s) in {bundle} bundle")

        return True

    bitstreams_to_overwrite = [
        bitstream
        for bitstream in bitstreams_in_bundle
        if bitstream["format"] in args.overwrite_format
    ]

    # Item has bitstreams, but none matching our overwrite format. Let's
    # err on the side of caution and return True so that we don't upload
    # another one into the bundle.
    if not bitstreams_to_overwrite:
        logger.debug("Existing bitstreams, but none matching our overwrite formats.")

        return True

    all_deleted = True

    for bitstream in bitstreams_to_overwrite:
        if args.dry_run:
            logger.info(
                Fore.YELLOW
                + f"> (DRY RUN) Deleting bitstream: {bitstream['name']} ({bitstream['uuid']})"
                + Fore.RESET
            )
        elif delete_bitstream(bitstream["uuid"]):
            logger.info(
                Fore.YELLOW
                + f"> Deleted bitstream: {bitstream['name']} ({bitstream['uuid']})"
                + Fore.RESET
            )
        else:
            # The old bitstream is still there; uploading now would leave a
            # duplicate in the bundle.
            logger.error(
                Fore.RED
                + f"> Could not delete bitstream: {bitstream['name']} ({bitstream['uuid']})"
                + Fore.RESET
            )
            all_deleted = False

    # False tells the caller it may upload; True makes it skip this item
    return not all_deleted


def delete_bitstream(bitstream_id: str):
    """Delete a bitstream.

    Equivalent to the following request with httpie or curl:

        $ http DELETE 'http://localhost:8080/rest/bitstreams/fca0fd2a-630e-4a34-b260-f645c8f2b027' \
            Cookie:JSESSIONID=B3B9C82F257BCE1773E6FB1EA5ACD774

    Exits the program if the API is unreachable.

    :param bitstream_id: uuid of bitstream in the DSpace repository.
    :returns: bool, True if the API answered with HTTP 200.
    """

    request_url = f"{rest_bitstreams_endpoint}/{bitstream_id}"
    headers = {"user-agent": user_agent}
    cookies = {"JSESSIONID": jsessionid}

    try:
        request = requests.delete(request_url, headers=headers, cookies=cookies)
    except requests.ConnectionError:
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)

    return request.status_code == requests.codes.ok
def upload_file(item_id: str, bundle: str, filename: str, description):
    """Upload a file to an existing item in the DSpace repository.

    Equivalent to the following request with httpie or curl:

        http POST \
        'http://localhost:8080/rest/items/21c0db9d-6c35-4111-9ca1-2c1345f44e40/bitstreams?name=file.pdf&description=Book&bundleName=ORIGINAL' \
        Cookie:JSESSIONID=0BDB219712F4F7DDB6055C1906F3E24B < file.pdf

    Exits the program if the API is unreachable.

    :param item_id: UUID of item to post the file to.
    :param bundle: Name of the bundle to upload bitstream to, ie ORIGINAL, THUMBNAIL, etc (will be created if it doesn't exist).
    :param filename: Name of the file to upload (must exist in the same directory as the script).
    :param description: Bitstream description for this file.
    :returns: bool
    """

    request_url = f"{rest_items_endpoint}/{item_id}/bitstreams"
    headers = {"user-agent": user_agent}
    cookies = {"JSESSIONID": jsessionid}

    request_params = {"name": filename, "bundleName": bundle}
    # Description is optional
    if description:
        request_params["description"] = description

    try:
        with open(filename, "rb") as file:
            # I'm not sure why, but we need to use data instead of files here
            # See: https://stackoverflow.com/questions/12385179/how-to-send-a-multipart-form-data-with-requests-in-python
            # See: https://stackoverflow.com/questions/43500502/send-file-through-post-without-content-disposition-in-python
            #
            # Hand requests the open file object so the body is streamed from
            # disk instead of loading the whole file into memory first.
            request = requests.post(
                request_url,
                headers=headers,
                cookies=cookies,
                params=request_params,
                data=file,
            )
    except requests.ConnectionError:
        logger.error(
            Fore.RED + f"> Could not connect to REST API: {request_url}" + Fore.RESET
        )

        sys.exit(1)
    except FileNotFoundError:
        logger.error(Fore.RED + f"> Could not open {filename}" + Fore.RESET)

        return False

    if request.status_code == requests.codes.ok:
        return True

    logger.error(Fore.RED + f"> Error uploading file: {filename}" + Fore.RESET)

    return False
if __name__ == "__main__":
    # Set the signal handler for SIGINT (^C)
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser(
        description="Post bitstreams to existing items in a DSpace 6.x repository."
    )
    parser.add_argument(
        "-d", "--debug", help="Print debug messages.", action="store_true"
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        help="Only print changes that would be made.",
        action="store_true",
    )
    parser.add_argument(
        "-u",
        "--rest-url",
        help="URL of the DSpace 6.x REST API.",
        default="http://localhost:8080/rest",
    )
    parser.add_argument("-e", "--user", help="Email address of administrator user.")
    parser.add_argument(
        "--overwrite-format",
        help="Bitstream formats to overwrite. Specify multiple formats separated by a space. Use this carefully, test with dry run first!",
        choices=["PNG", "JPEG", "GIF", "Adobe PDF", "WebP"],
        action="extend",
        nargs="+",
    )
    parser.add_argument("-p", "--password", help="Password of administrator user.")
    parser.add_argument(
        "-i",
        "--csv-file",
        help="Path to CSV file",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "-s", "--jsessionid", help="JESSIONID, if previously authenticated."
    )
    args = parser.parse_args()

    # The default log level is WARNING, but we want to set it to DEBUG or INFO
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # Since we're running interactively we can set the preferred log format for
    # the logging module during this invocation.
    logging.basicConfig(format="[%(levelname)s] %(message)s")

    # DSpace 6.x REST API base URL and endpoints
    rest_base_url = args.rest_url
    rest_login_endpoint = f"{rest_base_url}/login"
    rest_status_endpoint = f"{rest_base_url}/status"
    rest_items_endpoint = f"{rest_base_url}/items"
    rest_bitstreams_endpoint = f"{rest_base_url}/bitstreams"
    user_agent = "Alan Orth (ILRI) Python bot"

    # If the user passed a session ID then we should check if it is valid first.
    # Otherwise we should login and get a new session.
    if args.jsessionid and check_session(args.jsessionid):
        jsessionid = args.jsessionid
    else:
        jsessionid = login(args.user, args.password)

    # argparse has already opened the CSV file for us, so a missing file is
    # reported before we get here; wrapping the handle cannot fail that way.
    reader = csv.DictReader(args.csv_file)

    logger.debug(f"Opened {args.csv_file.name}")

    # fieldnames is None for an empty CSV; treat that as "no columns" so the
    # check below reports the missing field instead of raising TypeError.
    csv_fields = reader.fieldnames or []

    # Check if the required fields exist in the CSV
    for field in ["id", "filename", "bundle"]:
        if field not in csv_fields:
            logger.error(
                Fore.RED
                + f"Expected field {field} does not exist in the CSV."
                + Fore.RESET
            )

            sys.exit(1)

    for row in reader:
        item_id = row["id"]
        bundle = row["bundle"]

        # Check if this item already has a bitstream in this bundle (check_item
        # returns True if the bundle already has a bitstream).
        logger.info(f"{item_id}: checking for existing bitstreams in {bundle} bundle")

        if check_item(item_id, bundle):
            continue

        # Check if there is a description for this filename (SAFBuilder syntax:
        # "file.pdf__description:Some text")
        filename, separator, description = row["filename"].partition(
            "__description:"
        )
        if not separator:
            description = False

        if not os.path.isfile(filename):
            logger.info(
                f"{Fore.YELLOW}> File not found, skipping: {filename}{Fore.RESET}"
            )

            continue

        if args.dry_run:
            logger.info(
                f"{Fore.YELLOW}> (DRY RUN) Uploading file: {filename}{Fore.RESET}"
            )
        elif upload_file(item_id, bundle, filename, description):
            logger.info(
                f"{Fore.YELLOW}> Uploaded file: {filename} ({bundle}){Fore.RESET}"
            )
def login(user: str, password: str):
    """Log into the DSpace REST API.

    Equivalent to the following request with httpie or curl:

        $ http -f POST http://localhost:8080/rest/login email=aorth@fuuu.com password='fuuuuuu'

    Exits the program if the API is unreachable, the login is rejected, or the
    response carries no JSESSIONID cookie.

    :param user: email of user with permissions to update the item (should probably be an admin).
    :param password: password of user.
    :returns: JSESSION value for the session.
    """

    headers = {"user-agent": user_agent}
    # Use the credentials we were called with rather than reaching for the
    # global command line arguments.
    data = {"email": user, "password": password}

    print("Logging in...")

    try:
        request = requests.post(rest_login_endpoint, headers=headers, data=data)
    except requests.ConnectionError:
        sys.stderr.write(
            Fore.RED
            + f" Could not connect to REST API: {rest_login_endpoint}\n"
            + Fore.RESET
        )

        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(Fore.RED + " Login failed.\n" + Fore.RESET)

        sys.exit(1)

    try:
        jsessionid = request.cookies["JSESSIONID"]
    except KeyError:
        # A 200 response without the cookie is still a failed login; report it
        # instead of dying with a traceback.
        sys.stderr.write(
            Fore.RED + " Login failed: no JSESSIONID cookie in response.\n" + Fore.RESET
        )

        sys.exit(1)

    if args.debug:
        sys.stderr.write(
            Fore.GREEN + f" Logged in using JSESSIONID: {jsessionid}\n" + Fore.RESET
        )

    return jsessionid


def check_session(jsessionid: str):
    """Check the authentication status of the specified JSESSIONID.

    Exits the program if the API is unreachable.

    :param jsessionid: JSESSIONID value for a previously authenticated session.
    :returns: bool, True only if the status endpoint reports the session as
        authenticated.
    """

    request_url = rest_status_endpoint
    headers = {"user-agent": user_agent, "Accept": "application/json"}
    cookies = {"JSESSIONID": jsessionid}

    try:
        request = requests.get(request_url, headers=headers, cookies=cookies)
    except requests.ConnectionError:
        sys.stderr.write(
            Fore.RED + f" Could not connect to REST API: {request_url}\n" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(Fore.RED + " Error checking session status.\n" + Fore.RESET)

        return False

    if not request.json()["authenticated"]:
        sys.stderr.write(Fore.RED + f" Session expired: {jsessionid}\n" + Fore.RESET)

        return False

    return True
def url_to_filename(url: str):
    """Return filename from a URL.

    Uses the following process to extract the filename from a given URL:

        1. Split path component on slash like ['docs', 'file.pdf']
        2. Take last element ([-1])
        3. URL unencode using unquote() so we don't have "file%20name.pdf"

    :param url: URL of a PDF file to download, for example "https://example.com/docs/file.pdf"
    :returns: filename, for example "file.pdf"
    """

    return unquote(urlparse(url).path.split("/")[-1])


def check_item(row: dict):
    """Check if the item already has bitstreams.

    Equivalent to the following request with httpie or curl:

        $ http 'http://localhost:8080/rest/items/804351af-64eb-4e4a-968f-4d3be61358a8?expand=bitstreams,metadata' \
            Cookie:JSESSIONID=B3B9C82F257BCE1773E6FB1EA5ACD774

    To be safe, and to save myself from having to write extra logic, we only
    want to upload files to items that don't already have one. When the item
    has no bitstreams the file named by the row's URL is uploaded, using the
    item's dcterms.type as the bitstream description.

    :param row: row from the CSV file containing the item ID and URL of a file to download.
    """

    url = row["url"]
    item_id = row["id"]

    request_url = f"{rest_items_endpoint}/{item_id}"
    headers = {"user-agent": user_agent}
    # Not strictly needed here for permissions, but let's give the session ID
    # so that we don't allocate unecessary resources on the server.
    cookies = {"JSESSIONID": jsessionid}
    request_params = {"expand": "bitstreams,metadata"}

    try:
        request = requests.get(
            request_url, headers=headers, cookies=cookies, params=request_params
        )
    except requests.ConnectionError:
        sys.stderr.write(
            Fore.RED + f" Could not connect to REST API: {request_url}\n" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code != requests.codes.ok:
        return

    data = request.json()

    if len(data["bitstreams"]) != 0:
        if args.debug:
            sys.stderr.write(f"{item_id}: skipping item with existing bitstream(s)\n")

        return

    filename = url_to_filename(url)

    # Find the item type so we can use it as the bitstream description. Start
    # from None so an item without dcterms.type is uploaded without a
    # description instead of failing on an unbound variable.
    item_type = None
    for field in data["metadata"]:
        if field["key"] == "dcterms.type":
            item_type = field["value"]

    if args.debug:
        print(f"{item_id}: uploading {filename}")

    if upload_file(item_id, filename, item_type):
        print(Fore.YELLOW + f"{item_id}: uploaded {filename}" + Fore.RESET)


def download_file(url: str):
    """Download the file at ``url`` into the current directory.

    The local name is derived with url_to_filename(); a file that already
    exists is not downloaded again.

    :param url: URL of the file to download.
    :returns: bool, False if the download failed, True otherwise.
    """

    filename = url_to_filename(url)

    request_headers = {"user-agent": user_agent}

    # Check if file already exists
    if os.path.isfile(filename):
        if args.debug:
            print(f"> {filename} already downloaded.")

        return True

    print(f"> Downloading {filename}...")

    # Fetch the URL we were asked for, not whatever row the caller's loop
    # happens to be on.
    response = requests.get(url, headers=request_headers, stream=True)

    if response.status_code != 200:
        print(
            Fore.RED + f" > Download failed (HTTP {response.status_code})" + Fore.RESET
        )

        return False

    with open(filename, "wb") as fd:
        # Iterating the response directly yields 128-byte chunks; ask for
        # larger ones to avoid thousands of tiny writes per file.
        for chunk in response.iter_content(chunk_size=65536):
            fd.write(chunk)

    return True
def upload_file(item_id: str, filename: str, item_type: str):
    """Upload a file to an existing item in the DSpace repository.

    Equivalent to the following request with httpie or curl:

        http POST \
        'http://localhost:8080/rest/items/21c0db9d-6c35-4111-9ca1-2c1345f44e40/bitstreams?name=file.pdf&description=Book' \
        Cookie:JSESSIONID=0BDB219712F4F7DDB6055C1906F3E24B < file.pdf

    This will upload the bitstream into the item's ORIGINAL bundle.

    TODO: parameterize the bundle name so that we could upload a bunch of thumbnails.

    Exits the program if the API is unreachable.

    :param item_id: UUID of item to post the file to.
    :param filename: Name of the file to upload (must exist in the same directory as the script).
    :param item_type: Type of the item, to be used for the bitstream description.
    :returns: bool, True if the API answered with HTTP 200, False if the file
        could not be opened or the upload was rejected.
    """

    request_url = f"{rest_items_endpoint}/{item_id}/bitstreams"
    headers = {"user-agent": user_agent}
    cookies = {"JSESSIONID": jsessionid}
    request_params = {"name": filename, "description": item_type}

    try:
        # The with-block closes the file on every path, including the exit
        # taken when the connection fails.
        with open(filename, "rb") as file:
            # NOTE(review): this sends a multipart form body; confirm that the
            # REST API stores the file content rather than the multipart
            # envelope.
            request = requests.post(
                request_url,
                headers=headers,
                cookies=cookies,
                params=request_params,
                files={"file": file},
            )
    except FileNotFoundError:
        sys.stderr.write(Fore.RED + f" Could not open {filename}\n" + Fore.RESET)

        # Nothing to upload; report failure instead of carrying on without a
        # file handle.
        return False
    except requests.ConnectionError:
        sys.stderr.write(
            Fore.RED + f" Could not connect to REST API: {request_url}\n" + Fore.RESET
        )

        sys.exit(1)

    if request.status_code == requests.codes.ok:
        return True

    print(Fore.RED + f" Error uploading file: {filename}" + Fore.RESET)

    return False
# Command line interface: download every URL in the CSV and, unless running in
# download-only mode, post the file to the matching item.
parser = argparse.ArgumentParser(
    description="Download files and post them to existing items in a DSpace 6.x repository."
)
parser.add_argument("-d", "--debug", help="Print debug messages.", action="store_true")
parser.add_argument(
    "-u",
    "--rest-url",
    help="URL of the DSpace 6.x REST API.",
    default="http://localhost:8080/rest",
)
parser.add_argument("-e", "--user", help="Email address of administrator user.")
parser.add_argument("-p", "--password", help="Password of administrator user.")
parser.add_argument(
    "-i",
    "--csv-file",
    help="Path to CSV file",
    required=True,
    type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument(
    "-s", "--jsessionid", help="JESSIONID, if previously authenticated."
)
parser.add_argument(
    "-w", "--download-only", help="Only download the files.", action="store_true"
)
args = parser.parse_args()

# DSpace 6.x REST API base URL and endpoints
rest_base_url = args.rest_url
rest_login_endpoint = f"{rest_base_url}/login"
rest_status_endpoint = f"{rest_base_url}/status"
rest_items_endpoint = f"{rest_base_url}/items"
user_agent = "Alan Orth (ILRI) Python bot"

# Set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# If the user passed a session ID then we should check if it is valid first.
# Otherwise we should login and get a new session. If the user requested for
# download only mode then we skip authentication checks.
if not args.download_only:
    if args.jsessionid and check_session(args.jsessionid):
        jsessionid = args.jsessionid
    else:
        jsessionid = login(args.user, args.password)

if args.debug:
    sys.stderr.write(f"Opening {args.csv_file.name}\n")

# argparse has already opened the CSV file for us, so a missing file is
# reported before we get here; wrapping the handle cannot fail that way.
reader = csv.DictReader(args.csv_file)

# fieldnames is None for an empty CSV; treat that as "no columns" so the check
# below reports the missing field instead of raising TypeError.
csv_fields = reader.fieldnames or []

# Check if the item ID and URL fields exist in the CSV
for field in ["id", "url"]:
    if field not in csv_fields:
        sys.stderr.write(
            Fore.RED
            + f"Expected field {field} does not exist in the CSV.\n"
            + Fore.RESET
        )
        sys.exit(1)

for row in reader:
    if download_file(row["url"]):
        if not args.download_only:
            check_item(row)
def valid_ip(address):
    """Return True if ``address`` parses as an IPv4 or IPv6 address."""
    try:
        ipaddress.ip_address(address)

        return True

    except ValueError:
        return False


# read IPs from a text file, one per line
def read_addresses_from_file():
    """Read unique IP addresses from the input file and resolve them.

    Lines are stripped of surrounding whitespace; anything that is not a valid
    IP address (comments, blank lines) is skipped. The addresses are passed to
    resolve_addresses() in the order they first appeared.
    """

    # A dict keeps insertion order and gives constant-time duplicate checks,
    # unlike scanning a list for every line of a large log.
    unique_addresses = {}

    try:
        for line in args.input_file:
            # trim any leading or trailing whitespace (including newlines)
            candidate = line.strip()

            # skip any lines that aren't valid IPs
            if not valid_ip(candidate):
                continue

            unique_addresses.setdefault(candidate, None)
    finally:
        # close input file, even if reading it failed
        args.input_file.close()

    resolve_addresses(list(unique_addresses))
def resolve_addresses(addresses):
    """Look up each address in IPAPI (and optionally AbuseIPDB) and write a CSV.

    One row is written per address with the columns ip, org, asn and country,
    plus abuseConfidenceScore when an AbuseIPDB API key was given. HTTP
    responses are cached on disk for thirty days. The program exits on the
    first failed IPAPI request.

    :param addresses: list of IP address strings to look up.
    """

    if args.abuseipdb_api_key:
        fieldnames = ["ip", "org", "asn", "country", "abuseConfidenceScore"]
    else:
        fieldnames = ["ip", "org", "asn", "country"]

    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    # enable transparent request cache with thirty-day expiry
    expire_after = timedelta(days=30)
    # cache HTTP 200 responses
    requests_cache.install_cache("requests-cache", expire_after=expire_after)

    # prune old cache entries; without a condition delete() removes nothing
    requests_cache.delete(expired=True)

    # iterate through our addresses
    for address in addresses:
        print(f"Looking up {address} in IPAPI")

        # build IPAPI request URL for current address
        request_url = f"https://ipapi.co/{address}/json"

        request = requests.get(request_url)

        if args.debug and request.from_cache:
            sys.stderr.write(Fore.GREEN + "Request in cache.\n" + Fore.RESET)

        # check if we hit IPAPI's rate limit
        if request.status_code == 429:
            sys.stderr.write(Fore.RED + "Error: hit IPAPI rate limit.\n" + Fore.RESET)
            sys.exit(1)

        # if request status not 200 OK
        if request.status_code != requests.codes.ok:
            sys.stderr.write(
                Fore.RED
                + f"Error: request failed ({request.status_code}).\n"
                + Fore.RESET
            )
            sys.exit(1)

        data = request.json()

        # Use .get() so that a 200 response lacking these keys (for example an
        # error payload) leaves the cells empty instead of aborting the whole
        # run with a KeyError.
        row = {
            "ip": address,
            "org": data.get("org"),
            "asn": data.get("asn"),
            "country": data.get("country"),
        }

        if args.abuseipdb_api_key:
            print(f"→ Looking up {address} in AbuseIPDB")

            # build AbuseIPDB.com request URL for current address
            # see: https://docs.abuseipdb.com/#check-endpoint
            request_url = "https://api.abuseipdb.com/api/v2/check"
            request_headers = {"Key": args.abuseipdb_api_key}
            request_params = {"ipAddress": address, "maxAgeInDays": 90}

            abuse_request = requests.get(
                request_url, headers=request_headers, params=request_params
            )

            if args.debug and abuse_request.from_cache:
                sys.stderr.write(Fore.GREEN + "→ Request in cache.\n" + Fore.RESET)

            # if request status 200 OK
            if abuse_request.status_code == requests.codes.ok:
                abuse_data = abuse_request.json()

                abuseConfidenceScore = abuse_data["data"]["abuseConfidenceScore"]

                print(f"→ {address} has score: {abuseConfidenceScore}")

                row.update({"abuseConfidenceScore": abuseConfidenceScore})
            else:
                # The row is still written, just without a score; say so
                # rather than leaving an unexplained empty cell.
                sys.stderr.write(
                    Fore.YELLOW
                    + f"Warning: AbuseIPDB request failed ({abuse_request.status_code}).\n"
                    + Fore.RESET
                )

        writer.writerow(row)

    # close output file before we exit
    args.output_file.close()
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing IP addresses to resolve.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-k", + "--abuseipdb-api-key", + help="AbuseIPDB.com API key if you want to check whether IPs have been reported.", +) +parser.add_argument( + "-o", + "--output-file", + help="File name to save CSV output.", + required=True, + type=argparse.FileType("w"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +read_addresses_from_file() + +exit() diff --git a/ilri/resolve_addresses_geoip2.py b/ilri/resolve_addresses_geoip2.py new file mode 100755 index 000000000000..5f3cb038cc3b --- /dev/null +++ b/ilri/resolve_addresses_geoip2.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +# +# resolve-addresses-geoip2.py 0.0.2 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the local GeoIP DB for information about IP addresses read from a text +# file. The text file should have one address per line (comments and invalid li- +# nes are skipped). Consults GreyNoise to see if an IP address is known, and can +# optionally look up IPs in the AbuseIPDB.com if you provide an API key. 
GeoIP +# databases are expected to be here: +# +# - /var/lib/GeoIP/GeoLite2-City.mmdb +# - /var/lib/GeoIP/GeoLite2-ASN.mmdb +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install requests requests-cache colorama geoip2 +# + +import argparse +import csv +import ipaddress +import signal +import sys +from datetime import timedelta + +import geoip2.database +import requests +import requests_cache +from colorama import Fore + + +def valid_ip(address): + try: + ipaddress.ip_address(address) + + return True + + except ValueError: + return False + + +# read IPs from a text file, one per line +def read_addresses_from_file(): + # initialize an empty list for IP addresses + addresses = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # skip any lines that aren't valid IPs + if not valid_ip(line): + continue + + # iterate over results and add addresses that aren't already present + if line not in addresses: + addresses.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_addresses(addresses) + + +def resolve_addresses(addresses): + if args.abuseipdb_api_key: + fieldnames = [ + "ip", + "org", + "network", + "asn", + "country", + "greyNoiseClassification", + "abuseConfidenceScore", + ] + else: + fieldnames = [ + "ip", + "org", + "network", + "asn", + "country", + "greyNoiseClassification", + ] + + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + # enable transparent request cache with thirty-day expiry + expire_after = timedelta(days=30) + # cache HTTP 200 responses + requests_cache.install_cache( + "requests-cache", + expire_after=expire_after, + allowable_codes=(200, 404), + ) + + # prune old cache entries + requests_cache.delete() + + # iterate through our addresses + for address in addresses: + 
print(f"Looking up {address} in GeoIP2") + + # Look up IP information in the City database + with geoip2.database.Reader("/var/lib/GeoIP/GeoLite2-City.mmdb") as reader: + try: + response = reader.city(address) + + address_country = response.country.iso_code + except geoip2.errors.AddressNotFoundError: + pass + + # Look up organization information in the ASN database + with geoip2.database.Reader("/var/lib/GeoIP/GeoLite2-ASN.mmdb") as reader: + try: + response = reader.asn(address) + + address_org = response.autonomous_system_organization + address_net = response.network + address_asn = response.autonomous_system_number + except geoip2.errors.AddressNotFoundError: + if args.debug: + sys.stderr.write( + Fore.YELLOW + "→ IP not in database.\n" + Fore.RESET + ) + + pass + + row = { + "ip": address, + "org": address_org, + "network": address_net, + "asn": address_asn, + "country": address_country, + } + + # Only look up IPv4 addresses in GreyNoise + if isinstance(ipaddress.ip_address(address), ipaddress.IPv4Address): + print(f"→ Looking up {address} in GreyNoise") + + # build greynoise.io request URL for current address + # see: https://docs.greynoise.io/reference/get_v3-community-ip + request_url = f"https://api.greynoise.io/v3/community/{address}" + request_headers = {"Accept": "application/json"} + + request = requests.get(request_url, headers=request_headers) + + if args.debug and request.from_cache: + sys.stderr.write(Fore.GREEN + "→ Request in cache.\n" + Fore.RESET) + + # if request status 200 OK + if request.status_code == requests.codes.ok: + data = request.json() + + greyNoiseClassification = data["classification"] + + print(f"→ {address} has classification: {greyNoiseClassification}") + else: + # GreyNoise has not seen this address, so let's just say unknown + greyNoiseClassification = "unknown" + + row.update({"greyNoiseClassification": greyNoiseClassification}) + + if args.abuseipdb_api_key: + print(f"→ Looking up {address} in AbuseIPDB") + + # build 
AbuseIPDB.com request URL for current address + # see: https://docs.abuseipdb.com/#check-endpoint + request_url = "https://api.abuseipdb.com/api/v2/check" + request_headers = {"Key": args.abuseipdb_api_key} + request_params = {"ipAddress": address, "maxAgeInDays": 90} + + request = requests.get( + request_url, headers=request_headers, params=request_params + ) + + if args.debug and request.from_cache: + sys.stderr.write(Fore.GREEN + "→ Request in cache.\n" + Fore.RESET) + + # if request status 200 OK + if request.status_code == requests.codes.ok: + data = request.json() + + abuseConfidenceScore = data["data"]["abuseConfidenceScore"] + + print(f"→ {address} has score: {abuseConfidenceScore}") + + row.update({"abuseConfidenceScore": abuseConfidenceScore}) + + writer.writerow(row) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the public GeoIP2 database for information associated with a list of IP addresses from a text file." 
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing IP addresses to resolve.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-k", + "--abuseipdb-api-key", + help="AbuseIPDB.com API key if you want to check whether IPs have been reported.", +) +parser.add_argument( + "-o", + "--output-file", + help="File name to save CSV output.", + required=True, + type=argparse.FileType("w"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +read_addresses_from_file() + +exit() diff --git a/ilri/resolve_orcids.py b/ilri/resolve_orcids.py new file mode 100755 index 000000000000..2b3403903a2c --- /dev/null +++ b/ilri/resolve_orcids.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +# +# resolve-orcids.py 1.2.3 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public ORCID API for names associated with a list of ORCID iDs +# read from a text file or DSpace authority Solr core. Text file should have +# one ORCID identifier per line (comments and invalid lines are skipped). 
+# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import logging +import re +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) + + +# read ORCID identifiers from a text file, one per line +def read_identifiers_from_file(): + # initialize an empty list for ORCID iDs + orcids = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # regular expression for matching exactly one ORCID identifier on a line + pattern = re.compile(r"^[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}$") + + # skip the line if it doesn't match the pattern + if not pattern.match(line): + continue + + # iterate over results and add ORCID iDs that aren't already in the list + if line not in orcids: + orcids.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_orcid_identifiers(orcids) + + +# query DSpace's authority Solr core for ORCID identifiers +def read_identifiers_from_solr(): + # simple query from the 'authority' collection 2000 rows at a time (default is 1000) + solr_query_params = {"q": "orcid_id:*", "wt": "json", "rows": 2000} + + solr_url = args.solr_url + "/authority/select" + + res = requests.get(solr_url, params=solr_query_params) + + numFound = res.json()["response"]["numFound"] + logger.debug( + Fore.GREEN + + f"Total number of Solr records with ORCID iDs: {numFound}" + + Fore.RESET + ) + + # initialize an empty list for ORCID iDs + orcids = [] + + docs = res.json()["response"]["docs"] + # iterate over results and add ORCID iDs that aren't already in the list + # for example, we had 1600 ORCID iDs in Solr, but only 600 are unique + for doc 
in docs: + if doc["orcid_id"] not in orcids: + orcids.append(doc["orcid_id"]) + + # if the user requested --extract-only, write the current ORCID iD to output_file + if args.extract_only: + line = doc["orcid_id"] + "\n" + args.output_file.write(line) + + # exit now if the user requested --extract-only + if args.extract_only: + orcids_extracted = str(len(orcids)) + logger.debug( + Fore.GREEN + + f"Number of unique ORCID identifiers: {orcids_extracted}" + + Fore.RESET + ) + # close output file before we exit + args.output_file.close() + exit() + + resolve_orcid_identifiers(orcids) + + +# Query ORCID's public API for names associated with identifiers. Prefers to use +# the "credit-name" field if it is present, otherwise will default to using the +# "given-names" and "family-name" fields. +def resolve_orcid_identifiers(orcids): + unique_orcids = str(len(orcids)) + logger.debug( + Fore.GREEN + + f"Resolving names associated with {unique_orcids} unique ORCID identifiers.\n" + + Fore.RESET + ) + + # ORCID API endpoint, see: https://pub.orcid.org + orcid_api_base_url = "https://pub.orcid.org/v2.1/" + orcid_api_endpoint = "/person" + + # enable transparent request cache with thirty-day expiry + expire_after = timedelta(days=30) + # cache HTTP 200 and 404 responses, because ORCID uses HTTP 404 when an identifier doesn't exist + requests_cache.install_cache( + "requests-cache", expire_after=expire_after, allowable_codes=(200, 404) + ) + + # prune old cache entries + requests_cache.delete() + + # iterate through our ORCID iDs and fetch their names from the ORCID API + for orcid in orcids: + logger.debug( + Fore.GREEN + + f"Looking up the names associated with ORCID iD: {orcid}" + + Fore.RESET + ) + + # build request URL for current ORCID ID + request_url = orcid_api_base_url + orcid.strip() + orcid_api_endpoint + + # ORCID's API defaults to some custom format, so tell it to give us JSON + request = requests.get(request_url, headers={"Accept": "application/json"}) + + # Check 
the request status + if request.status_code == requests.codes.ok: + # read response JSON into data + data = request.json() + + # make sure name element is not null + if data["name"]: + # prefer to use credit-name if present and not blank + if ( + data["name"]["credit-name"] + and data["name"]["credit-name"]["value"] != "" + ): + line = data["name"]["credit-name"]["value"] + # otherwise try to use given-names and or family-name + else: + # make sure given-names is present and not deactivated + if ( + data["name"]["given-names"] + and data["name"]["given-names"]["value"] + != "Given Names Deactivated" + ): + line = data["name"]["given-names"]["value"] + else: + logger.debug( + Fore.YELLOW + + "Ignoring null or deactivated given-names element." + + Fore.RESET + ) + # make sure family-name is present and not deactivated + if ( + data["name"]["family-name"] + and data["name"]["family-name"]["value"] + != "Family Name Deactivated" + ): + line = f'{line} {data["name"]["family-name"]["value"]}' + else: + logger.debug( + Fore.YELLOW + + "Ignoring null or deactivated family-name element." + + Fore.RESET + ) + # check if line has something (a credit-name, given-names, and or family-name) + if line and line != "": + line = "{0}: {1}".format(line.strip(), orcid) + else: + logger.debug( + Fore.RED + + "Skipping identifier with no valid name elements." + + Fore.RESET + ) + + continue + + if not args.quiet: + logger.info(line) + + # write formatted name and ORCID identifier to output file + args.output_file.write(f"{line}\n") + + # clear line for next iteration + line = None + else: + logger.debug( + Fore.YELLOW + + "Skipping identifier with null name element." + + Fore.RESET + ) + # HTTP 404 means that the API url or identifier was not found. If the + # API URL is correct, let's assume that the identifier was not found. + elif request.status_code == 404: + logger.debug( + Fore.YELLOW + + "Skipping missing identifier (API request returned HTTP 404)." 
+ + Fore.RESET + ) + + continue + # HTTP 409 means that the identifier is locked for some reason + # See: https://members.orcid.org/api/resources/error-codes + elif request.status_code == 409: + logger.debug( + Fore.YELLOW + + "Skipping locked identifier (API request returned HTTP 409)." + + Fore.RESET + ) + + continue + else: + logger.error(Fore.RED + "Request failed." + Fore.RESET) + # close output file before we exit + args.output_file.close() + sys.exit(1) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description='Query the public ORCID API for names associated with a list of ORCID identifiers, either from a text file or a DSpace authority Solr core. Optional "extract only" mode will simply fetch the ORCID identifiers from Solr and write them to the output file without resolving their names from ORCID\'s API.' 
+) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-e", + "--extract-only", + help="If fetching ORCID identifiers from Solr, write them to the output file without resolving their names from the ORCID API.", + action="store_true", +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +parser.add_argument( + "-q", + "--quiet", + help="Do not print results to screen as we find them (results will still go to output file).", + action="store_true", +) +# group of mutually exclusive options +group = parser.add_mutually_exclusive_group(required=True) +group.add_argument( + "-i", + "--input-file", + help="File name containing ORCID identifiers to resolve.", + type=argparse.FileType("r"), +) +group.add_argument( + "-s", + "--solr-url", + help="URL of Solr application (for example: http://localhost:8080/solr).", +) +args = parser.parse_args() + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# Set the global log format +logging.basicConfig(format="[%(levelname)s] %(message)s") + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the ORCID identifiers from there +if args.input_file: + read_identifiers_from_file() +# otherwise, get the ORCID identifiers from Solr +elif args.solr_url: + read_identifiers_from_solr() diff --git a/ilri/rest_find_collections.py b/ilri/rest_find_collections.py new file mode 100755 index 000000000000..092f4524145f --- /dev/null +++ b/ilri/rest_find_collections.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# +# rest-find-collections.py 1.1.3 +# +# Copyright Alan Orth. 
+# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# A quick and dirty example for parsing the DSpace REST API to find and print +# the names of all collections contained in a community hierarchy. It expects +# exactly one command line argument: the handle of a community. For example: +# +# $ ./rest-find-collections.py 10568/1 +# +# You can optionally specify the URL of a DSpace REST application (default is to +# use http://localhost:8080/rest). +# +# This script is written for Python 3 and requires several modules that you can +# install with pip (I recommend setting up a Python virtual environment first): +# +# $ pip install requests colorama +# +# See: https://requests.readthedocs.org/en/master + +import argparse +import signal +import sys + +import requests +from colorama import Fore + + +def signal_handler(signal, frame): + sys.exit(1) + + +def parse_community(community_id): + request_url = ( + rest_base_url + + rest_communities_endpoint + + str(community_id) + + "?expand=collections,subCommunities" + ) + try: + request = requests.get(request_url, headers={"user-agent": rest_user_agent}) + except requests.ConnectionError: + sys.stderr.write( + Fore.RED + "Could not connect to {0}.\n".format(args.rest_url) + Fore.RESET + ) + exit(1) + + if request.status_code == requests.codes.ok: + subcommunities = request.json()["subcommunities"] + collections = request.json()["collections"] + + for subcommunity in subcommunities: + subcommunity_id = subcommunity["uuid"] + + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Found subcommunity (id: {subcommunity_id}, handle: {subcommunity_handle}): {subcommunity_name} ==> I must go deeper!\n".format( + subcommunity_id=str(subcommunity_id), + subcommunity_handle=subcommunity["handle"], + subcommunity_name=subcommunity["name"], + ) + + Fore.RESET + ) + + parse_community(subcommunity_id) + + for collection in collections: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + "Found collection (id: {collection_id}, 
handle: {collection_handle}): {collection_name}\n".format( + collection_id=str(collection["uuid"]), + collection_handle=collection["handle"], + collection_name=collection["name"], + ) + + Fore.RESET + ) + + all_collections.append(collection["name"]) + else: + sys.stderr.write( + Fore.RED + + "Status not ok! Request URL was: {request_url}\n".format( + request_url=request.url + ) + + Fore.RESET + ) + exit(1) + + +parser = argparse.ArgumentParser( + description="Find all collections under a given DSpace community." +) +parser.add_argument("community", help="Community to process, for example: 10568/1") +parser.add_argument("-d", "--debug", help="Print debug messages.", action="store_true") +parser.add_argument( + "-u", + "--rest-url", + help="URL of DSpace REST application.", + default="http://localhost:8080/rest", +) +args = parser.parse_args() + +handle = args.community + +# REST base URL and endpoints (with leading and trailing slashes) +rest_base_url = args.rest_url +rest_handle_endpoint = "/handle/" +rest_communities_endpoint = "/communities/" +rest_collections_endpoint = "/collections/" +rest_user_agent = "Alan Test Python Requests Bot" + +# initialize empty list of all collections +all_collections = [] + +# set the signal handler for SIGINT (^C) +signal.signal(signal.SIGINT, signal_handler) + +# fetch the metadata for the given handle +request_url = rest_base_url + rest_handle_endpoint + str(handle) + +try: + request = requests.get(request_url, headers={"user-agent": rest_user_agent}) +except requests.ConnectionError: + sys.stderr.write( + Fore.RED + + "Could not connect to REST API: {0}.\n".format(args.rest_url) + + Fore.RESET + ) + exit(1) + +# Check the request status +if request.status_code == requests.codes.ok: + handle_type = request.json()["type"] + + # Make sure the given handle is a community + if handle_type == "community": + community_id = request.json()["uuid"] + parse_community(community_id) + + for collection in all_collections: + print( + 
Fore.GREEN + + "Name of collection: {collection}".format(collection=collection) + + Fore.RESET + ) + else: + sys.stderr.write( + Fore.RED + + '{handle} is type "{handle_type}", not community.\n'.format( + handle=handle, handle_type=handle_type + ) + + Fore.RESET + ) + exit(1) +else: + sys.stderr.write( + Fore.RED + + "Request failed. Are you sure {handle} is a valid handle?\n".format( + handle=handle + ) + + Fore.RESET + ) + exit(1) diff --git a/ilri/ror_lookup.py b/ilri/ror_lookup.py new file mode 100755 index 000000000000..776354515600 --- /dev/null +++ b/ilri/ror_lookup.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# +# ror-lookup.py 0.1.1 +# +# Copyright Alan Orth. +# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the Research Organization Registry dataset for organizations read +# from a text file. Text file should have one organization per line. Results +# are saved to a CSV including the organization and whether it matched or not. +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama +# + +import argparse +import csv +import json +import logging +import signal +import sys + +from colorama import Fore + +# Create a local logger instance +logger = logging.getLogger(__name__) +# Set the global log format +logging.basicConfig(format="[%(levelname)s] %(message)s") + + +# read organizations from a text file, one per line +def read_organizations_from_file(): + # initialize an empty list for organization + organizations = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add organization that aren't already present + if line not in organizations: + organizations.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_organizations(organizations) + + +def 
resolve_organizations(organizations): + fieldnames = ["organization", "match type", "matched"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + for organization in organizations: + logger.debug(f"Looking up the organization: {organization}") + + # check for exact match + if organization.lower() in ror_names: + logger.info( + f"{Fore.GREEN}Name match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "name", + "matched": "true", + } + ) + elif organization.lower() in ror_aliases: + logger.info( + f"{Fore.GREEN}Alias match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "alias", + "matched": "true", + } + ) + elif organization.lower() in ror_acronyms: + logger.info( + f"{Fore.GREEN}Acronym match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "acronym", + "matched": "true", + } + ) + else: + logger.debug( + f"{Fore.YELLOW}No match for {organization!r} in ROR{Fore.RESET}" + ) + + writer.writerow( + { + "organization": organization, + "match type": "", + "matched": "false", + } + ) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the ROR JSON to validate organizations from a text file and save results in a CSV." +) +parser.add_argument( + "-d", + "--debug", + help="Set log level to DEBUG.", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing organizations to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-r", + "--ror-json", + help="ror.json file containing organizations to look up. 
See: https://doi.org/10.6084/m9.figshare.c.4596503.v5", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file to write results to (CSV).", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# The default log level is WARNING, but we want to set it to DEBUG or INFO +if args.debug: + logger.setLevel(logging.DEBUG) +else: + logger.setLevel(logging.INFO) + +# if the user specified an input file, get the organizations from there +if args.input_file and args.ror_json: + ror = json.load(args.ror_json) + + # list comprehension instead of a for loop to extract all names + ror_names = [org["name"].lower() for org in ror] + + # nested list comprehension to extract aliases, think of it like: + # ror_aliases_all = [] + # for org in ror: + # for alias in org['aliases']: + # ror_aliases_all.append(alias) + # + # See: https://stackoverflow.com/questions/18072759/list-comprehension-on-a-nested-list + ror_aliases_all = [alias.lower() for org in ror for alias in org["aliases"]] + # dedupe the list by converting it to a dict and back to a list (dicts can't + # have any duplicate items) + ror_aliases = list(dict.fromkeys(ror_aliases_all)) + # delete the list of all aliases + del ror_aliases_all + + # same for acronyms + ror_acronyms_all = [acronym.lower() for org in ror for acronym in org["acronyms"]] + ror_acronyms = list(dict.fromkeys(ror_acronyms_all)) + del ror_acronyms_all + + read_organizations_from_file() + +exit() diff --git a/ilri/sherpa_issn_lookup.py b/ilri/sherpa_issn_lookup.py new file mode 100755 index 000000000000..524190711159 --- /dev/null +++ b/ilri/sherpa_issn_lookup.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# +# sherpa-issn-lookup.py 0.0.1 +# +# Copyright Alan Orth. 
+# +# SPDX-License-Identifier: GPL-3.0-only +# +# --- +# +# Queries the public Sherpa API for journal titles using ISSNs read from a +# text file. The text file should have one ISSN per line. +# +# See: https://v2.sherpa.ac.uk/api/object-retrieval-by-id.html +# +# This script is written for Python 3.6+ and requires several modules that you +# can install with pip (I recommend using a Python virtual environment): +# +# $ pip install colorama requests requests-cache +# + +import argparse +import csv +import signal +import sys +from datetime import timedelta + +import requests +import requests_cache +from colorama import Fore + + +# read journals from a text file, one per line +def read_issns_from_file(): + # initialize an empty list for ISSNs + issns = [] + + for line in args.input_file: + # trim any leading or trailing whitespace (including newlines) + line = line.strip() + + # iterate over results and add ISSNs that aren't already present + if line not in issns: + issns.append(line) + + # close input file before we exit + args.input_file.close() + + resolve_issns(issns) + + +def resolve_issns(issns): + fieldnames = ["issn", "journal title"] + writer = csv.DictWriter(args.output_file, fieldnames=fieldnames) + writer.writeheader() + + # enable transparent request cache with two weeks expiry + expire_after = timedelta(days=14) + requests_cache.install_cache("requests-cache", expire_after=expire_after) + + # prune old cache entries + requests_cache.delete() + + for issn in issns: + if args.debug: + sys.stderr.write(Fore.GREEN + f"Looking up ISSN: {issn}\n" + Fore.RESET) + + request_url = "https://v2.sherpa.ac.uk/cgi/retrieve_by_id" + request_params = { + "item-type": "publication", + "format": "Json", + "api-key": args.api_key, + "identifier": issn, + } + + try: + request = requests.get(request_url, params=request_params) + + data = request.json() + except requests.exceptions.ConnectionError: + sys.stderr.write(Fore.RED + "Connection error.\n" + Fore.RESET) + + # 
CrossRef responds 404 if a journal isn't found, so we check for an + # HTTP 2xx response here + if request.status_code == requests.codes.ok and len(data["items"]) == 1: + print(f"Exact match for {issn} in Sherpa (cached: {request.from_cache})") + + writer.writerow( + {"issn": issn, "journal title": data["items"][0]["title"][0]["title"]} + ) + else: + if args.debug: + sys.stderr.write( + Fore.YELLOW + + f"No match for {issn} in Sherpa (cached: {request.from_cache})\n" + + Fore.RESET + ) + + writer.writerow({"issn": issn, "journal title": ""}) + + # close output file before we exit + args.output_file.close() + + +def signal_handler(signal, frame): + # close output file before we exit + args.output_file.close() + + sys.exit(1) + + +parser = argparse.ArgumentParser( + description="Query the Crossref REST API to validate ISSNs from a text file." +) +parser.add_argument( + "-a", + "--api-key", + help="Sherpa API KEY.", +) +parser.add_argument( + "-d", + "--debug", + help="Print debug messages to standard error (stderr).", + action="store_true", +) +parser.add_argument( + "-i", + "--input-file", + help="File name containing ISSNs to look up.", + required=True, + type=argparse.FileType("r"), +) +parser.add_argument( + "-o", + "--output-file", + help="Name of output file (CSV) to write results to.", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), +) +args = parser.parse_args() + +# set the signal handler for SIGINT (^C) so we can exit cleanly +signal.signal(signal.SIGINT, signal_handler) + +# if the user specified an input file, get the ISSNs from there +if args.input_file: + read_issns_from_file() + +exit() diff --git a/ilri/subdivision_lookup.py b/ilri/subdivision_lookup.py new file mode 100755 index 000000000000..0591ac29ed1c --- /dev/null +++ b/ilri/subdivision_lookup.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# subdivision-lookup.py 0.0.1 +# +# Copyright Alan Orth. 
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# Queries the pycountry ISO 3166-2 dataset for subdivisions read from a text
# file. Text file should have one subdivision per line. Results are saved to
# a CSV including the subdivision and whether it matched or not.
#
# This script is written for Python 3.6+ and requires several modules that you
# can install with pip (I recommend using a Python virtual environment):
#
#   $ pip install colorama pycountry
#

import argparse
import csv
import signal
import sys

import pycountry
from colorama import Fore


def read_subdivisions_from_file():
    """Read subdivisions from the input file and resolve them.

    Reads ``args.input_file`` (one subdivision per line), trims surrounding
    whitespace, drops blank lines and duplicates while keeping first-seen
    order, closes the input file and hands the result to
    ``resolve_subdivisions``.
    """

    # trim any leading or trailing whitespace (including newlines)
    stripped_lines = (line.strip() for line in args.input_file)

    # dict.fromkeys() de-duplicates in a single pass and preserves order; a
    # blank line is not a subdivision, so it is skipped rather than looked up
    subdivisions = list(dict.fromkeys(line for line in stripped_lines if line))

    # close input file before we exit
    args.input_file.close()

    resolve_subdivisions(subdivisions)


def resolve_subdivisions(subdivisions):
    """Write a CSV row for each subdivision saying whether it matched.

    A subdivision matches when its lower-cased form is present in the
    module-level ``subdivision_names`` collection. Matches are printed to
    stdout; lookups and misses are only reported on stderr when ``--debug``
    is set. The output file is closed when all rows have been written.

    :param subdivisions: iterable of subdivision names to look up.
    """

    fieldnames = ["subdivision", "matched"]
    writer = csv.DictWriter(args.output_file, fieldnames=fieldnames)
    writer.writeheader()

    for subdivision in subdivisions:
        if args.debug:
            sys.stderr.write(
                Fore.GREEN + f"Looking up the subdivision: {subdivision}\n" + Fore.RESET
            )

        # check for exact (case-insensitive) match
        matched = subdivision.lower() in subdivision_names

        if matched:
            print(f"Match for {subdivision!r}")
        elif args.debug:
            sys.stderr.write(
                Fore.YELLOW + f"No match for {subdivision!r}\n" + Fore.RESET
            )

        writer.writerow(
            {
                "subdivision": subdivision,
                "matched": "true" if matched else "false",
            }
        )

    # close output file before we exit
    args.output_file.close()


def signal_handler(signum, frame):
    """Handle SIGINT (^C): close the output file and exit with status 1.

    The first parameter is named ``signum`` so that it does not shadow the
    ``signal`` module inside the handler.
    """

    # close output file before we exit
    args.output_file.close()

    sys.exit(1)


parser = argparse.ArgumentParser(
    description="Query pycountry's ISO 3166-2 list to validate subdivisions from a text file and save results in a CSV."
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="File name containing subdivisions to look up.",
    required=True,
    type=argparse.FileType("r"),
)
parser.add_argument(
    "-o",
    "--output-file",
    help="Name of output file to write results to (CSV).",
    required=True,
    type=argparse.FileType("w", encoding="UTF-8"),
)
args = parser.parse_args()

# set the signal handler for SIGINT (^C) so we can exit cleanly
signal.signal(signal.SIGINT, signal_handler)

# set comprehension so that each membership test in resolve_subdivisions() is
# a hash lookup instead of a scan over every ISO 3166-2 subdivision name
subdivision_names = {subdivision.name.lower() for subdivision in pycountry.subdivisions}

read_subdivisions_from_file()

# use sys.exit() rather than the exit() helper, which is only injected by the
# site module for interactive use
sys.exit(0)
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
#   $ pip install colorama
#

import argparse
import logging
import re
import signal
import sys

import util
from colorama import Fore

# Create a local logger instance
logger = logging.getLogger(__name__)

# Pattern used to pull the ORCID identifier out of an input line. Compiled
# once here instead of once per input line.
ORCID_IDENTIFIER_PATTERN = re.compile(
    r"[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}"
)

# note that the SQL here is quoted differently to allow us to use
# LIKE with % wildcards with our parameter substitution
SELECT_SQL = "SELECT text_value, dspace_object_id FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value LIKE '%%' || %s || '%%' AND text_value!=%s"
UPDATE_SQL = "UPDATE metadatavalue SET text_value=%s WHERE dspace_object_id IN (SELECT uuid FROM item WHERE in_archive AND NOT withdrawn) AND metadata_field_id=%s AND text_value LIKE '%%' || %s || '%%' AND text_value!=%s"


def signal_handler(signum, frame):
    """Handle SIGINT (^C) by exiting with status 1.

    The first parameter is named ``signum`` so that it does not shadow the
    ``signal`` module inside the handler.
    """
    sys.exit(1)


parser = argparse.ArgumentParser(
    description="Update ORCID records in the DSpace PostgreSQL database."
)
parser.add_argument(
    "-i",
    "--input-file",
    help='Path to input file containing ORCIDs in format "Alan S. Orth: 0000-0002-1735-7458".',
    required=True,
    type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument("-db", "--database-name", help="Database name", required=True)
parser.add_argument("-u", "--database-user", help="Database username", required=True)
parser.add_argument("-p", "--database-pass", help="Database password", required=True)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages to standard error (stderr).",
    action="store_true",
)
parser.add_argument(
    "-n",
    "--dry-run",
    help="Only print changes that would be made.",
    action="store_true",
)
parser.add_argument(
    "-q",
    "--quiet",
    help="Do not print progress messages to the screen.",
    action="store_true",
)
args = parser.parse_args()

# The default log level is WARNING, but we want to set it to DEBUG or INFO
if args.debug:
    logger.setLevel(logging.DEBUG)
else:
    logger.setLevel(logging.INFO)

# Set the global log format
logging.basicConfig(format="[%(levelname)s] %(message)s")

# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# connect to database
conn = util.db_connect(
    args.database_name, args.database_user, args.database_pass, "localhost"
)

if args.dry_run:
    conn.read_only = True

cursor = conn.cursor()

# The metadata field is the same for every input line, so resolve its ID once
# instead of querying the registry again for each line.
metadata_field_id = util.field_name_to_field_id(cursor, "cg.creator.identifier")

# Use read().splitlines() so we don't get newlines after each line
for line in args.input_file.read().splitlines():
    # The whole line becomes the new text_value, so trim stray leading or
    # trailing whitespace to keep it out of the database.
    line = line.strip()

    # extract the ORCID identifier from the current line
    orcid_identifier_match = ORCID_IDENTIFIER_PATTERN.search(line)

    # sanity check to make sure we extracted the ORCID identifier
    if orcid_identifier_match is None:
        if args.debug:
            sys.stderr.write(
                Fore.YELLOW
                + f'Skipping invalid ORCID identifier in "{line}".\n'
                + Fore.RESET
            )
        continue

    # we only expect one ORCID identifier, so if it matches it will be group "0"
    # see: https://docs.python.org/3/library/re.html
    orcid_identifier = orcid_identifier_match.group(0)

    cursor.execute(SELECT_SQL, (metadata_field_id, orcid_identifier, line))

    # Get the records for items with matching metadata. We will use the
    # object IDs to update their last_modified dates.
    matching_records = cursor.fetchall()

    if args.dry_run:
        if cursor.rowcount > 0 and not args.quiet:
            logger.info(
                Fore.GREEN
                + f"(DRY RUN) Fixed {cursor.rowcount} occurences of: {line}"
                + Fore.RESET
            )
    else:
        cursor.execute(
            UPDATE_SQL,
            (
                line,
                metadata_field_id,
                orcid_identifier,
                line,
            ),
        )

        if cursor.rowcount > 0 and not args.quiet:
            logger.info(
                Fore.GREEN
                + f"Fixed {cursor.rowcount} occurences of: {line}"
                + Fore.RESET
            )

        # Update the last_modified date for each item we've changed
        for record in matching_records:
            util.update_item_last_modified(cursor, record[1])


# commit changes when we're done
if not args.dry_run:
    conn.commit()

# close database connection before we exit
conn.close()

# close input file
args.input_file.close()

sys.exit(0)
def field_name_to_field_id(cursor, metadata_field: str):
    """Return the metadata_field_id for a given metadata field.

    :param cursor: a psycopg cursor with an active database session.
    :param metadata_field: the metadata field, for example "dcterms.title".
    :returns int
    :raises ValueError: if the name is not "schema.element" or
        "schema.element.qualifier", or if the schema or the field is not
        present in the registry.
    """

    parts = metadata_field.split(".")

    if len(parts) == 3:
        schema, element, qualifier = parts
    elif len(parts) == 2:
        schema, element = parts
        qualifier = None
    else:
        raise ValueError(
            f"Invalid metadata field {metadata_field!r}: expected "
            '"schema.element" or "schema.element.qualifier".'
        )

    # First we need to get the schema ID
    sql = "SELECT metadata_schema_id FROM metadataschemaregistry WHERE short_id=%s;"
    # Syntax looks weird here, but the second argument must always be a sequence
    # See: https://www.psycopg.org/docs/usage.html
    cursor.execute(sql, [schema])

    row = cursor.fetchone()
    if row is None:
        raise ValueError(f"Metadata schema {schema!r} does not exist.")
    metadata_schema_id = row[0]

    # Now we can get the metadata field ID, paying attention to whether the
    # field has a qualifier or not.
    if qualifier:
        sql = "SELECT metadata_field_id FROM metadatafieldregistry WHERE metadata_schema_id=%s AND element=%s AND qualifier=%s;"
        cursor.execute(sql, [metadata_schema_id, element, qualifier])
    else:
        # Without "qualifier IS NULL" this would also match every qualified
        # field sharing the element and could return the wrong field's ID.
        sql = "SELECT metadata_field_id FROM metadatafieldregistry WHERE metadata_schema_id=%s AND element=%s AND qualifier IS NULL;"
        cursor.execute(sql, [metadata_schema_id, element])

    row = cursor.fetchone()
    if row is None:
        raise ValueError(f"Metadata field {metadata_field!r} does not exist.")

    return row[0]


def update_item_last_modified(cursor, dspace_object_id: str):
    """Update an item's last_modified timestamp.

    :param cursor: a psycopg cursor with an active database session.
    :param dspace_object_id: dspace_object_id of the item to update.
    """

    sql = "UPDATE item SET last_modified=NOW() WHERE uuid=%s;"
    # Syntax looks weird here, but the second argument must always be a sequence
    # See: https://www.psycopg.org/docs/usage.html
    cursor.execute(sql, [dspace_object_id])


def db_connect(
    database_name: str, database_user: str, database_pass: str, database_host: str
):
    """Connect to a PostgreSQL database.

    Prints an error to stderr and exits with status 1 if the connection
    cannot be established.

    :param database_name: a string containing the database name.
    :param database_user: a string containing the database user.
    :param database_pass: a string containing the database pass.
    :param database_host: a string containing the database host.
    :returns psycopg connection
    """

    try:
        # Pass the parameters as keywords instead of formatting them into a
        # connection string, so values containing spaces, quotes or
        # backslashes (passwords especially) cannot corrupt the conninfo.
        conn = psycopg.connect(
            dbname=database_name,
            user=database_user,
            password=database_pass,
            host=database_host,
        )
    except psycopg.OperationalError:
        sys.stderr.write(Fore.RED + "Could not connect to database.\n" + Fore.RESET)
        sys.exit(1)

    return conn


def read_dois_from_file(input_file) -> list:
    """Read DOIs from a file.

    DOIs should be one per line with either http, https, dx.doi.org, doi.org
    or just the DOI itself. Anything other than the DOI will be stripped.
    Blank lines are ignored and duplicates are dropped, keeping the order in
    which DOIs first appear. The file is closed before returning.

    :param input_file: an open text file handle, iterated line by line.
    :returns list of DOIs
    """

    # matches http://, https://, etc so we only keep the DOI component
    doi_url_prefix = re.compile(r"^https?://(dx\.)?doi\.org/")

    # dict keys give us ordered de-duplication in a single pass
    dois = {}

    for line in input_file:
        # trim any leading or trailing whitespace (including newlines)
        doi = doi_url_prefix.sub("", line.strip())

        # skip blank lines instead of recording an empty DOI
        if doi:
            dois[doi] = None

    # close input file before we exit
    input_file.close()

    return list(dois)


def download_file(url, filename) -> bool:
    """Download a URL to a local file.

    :param url: the URL to fetch.
    :param filename: path of the file to write the response body to.
    :returns True if the file exists on disk after downloading, False if the
        server did not return a successful response.
    """

    # Disable cache for streaming downloads
    # See: https://github.com/requests-cache/requests-cache/issues/75
    with requests_cache.disabled():
        r = requests.get(url, stream=True, allow_redirects=True)

    # A streamed response holds its connection open until it is closed, so
    # use it as a context manager to release it on every exit path.
    with r:
        # Download failed for some reason
        if not r.ok:
            return False

        with open(filename, "wb") as f:
            # Make sure we handle zipped content: we copy from r.raw, which
            # is not decoded for us.
            content_encoding = r.headers.get("Content-Encoding")

            if content_encoding == "gzip":
                with gzip.GzipFile(fileobj=r.raw) as gzip_file:
                    shutil.copyfileobj(gzip_file, f)
            else:
                shutil.copyfileobj(r.raw, f)

    # Check whether the file was written to disk after downloading
    return os.path.isfile(filename)