From 3cc9844e070a4fcd9548eb5c3f468b07919195f4 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Tue, 11 Jun 2024 17:35:11 +0530 Subject: [PATCH 1/9] modified resource listing --- .vscode/c_cpp_properties.json | 18 +++++++++ .vscode/settings.json | 62 +++++++++++++++++++++++++++++ main.py | 21 ++++++++-- objects.py | 5 ++- requirements.txt | 1 + sources/eudat.py | 2 +- sources/gesis.py | 12 +++--- sources/zenodo.py | 30 +++++++++++--- templates/components/resources.html | 15 +++++++ templates/results.html | 51 ++++++++++++++++++++++++ 10 files changed, 199 insertions(+), 18 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/settings.json diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..3093fcd --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "windows-gcc-x86", + "includePath": [ + "${workspaceFolder}/**" + ], + "compilerPath": "E:/make/bin/gcc.exe", + "cStandard": "${default}", + "cppStandard": "${default}", + "intelliSenseMode": "windows-gcc-x86", + "compilerArgs": [ + "" + ] + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..dc0d342 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,62 @@ +{ + "C_Cpp_Runner.cCompilerPath": "gcc", + "C_Cpp_Runner.cppCompilerPath": "g++", + "C_Cpp_Runner.debuggerPath": "gdb", + "C_Cpp_Runner.cStandard": "", + "C_Cpp_Runner.cppStandard": "", + "C_Cpp_Runner.msvcBatchPath": "C:/Program Files/Microsoft Visual Studio/VR_NR/Community/VC/Auxiliary/Build/vcvarsall.bat", + "C_Cpp_Runner.useMsvc": false, + "C_Cpp_Runner.warnings": [ + "-Wall", + "-Wextra", + "-Wpedantic", + "-Wshadow", + "-Wformat=2", + "-Wcast-align", + "-Wconversion", + "-Wsign-conversion", + "-Wnull-dereference" + ], + "C_Cpp_Runner.msvcWarnings": [ + "/W4", + "/permissive-", + "/w14242", + "/w14287", + "/w14296", + "/w14311", + "/w14826", + "/w44062", + "/w44242", + "/w14905", + "/w14906", + "/w14263", + "/w44265", + "/w14928" + ], + "C_Cpp_Runner.enableWarnings": true, + "C_Cpp_Runner.warningsAsError": false, + "C_Cpp_Runner.compilerArgs": [], + "C_Cpp_Runner.linkerArgs": [], + "C_Cpp_Runner.includePaths": [], + "C_Cpp_Runner.includeSearch": [ + "*", + "**/*" + ], + "C_Cpp_Runner.excludeSearch": [ + "**/build", + "**/build/**", + "**/.*", + "**/.*/**", + "**/.vscode", + "**/.vscode/**" + ], + "C_Cpp_Runner.useAddressSanitizer": false, + "C_Cpp_Runner.useUndefinedSanitizer": false, + "C_Cpp_Runner.useLeakSanitizer": false, + "C_Cpp_Runner.showCompilationTime": false, + "C_Cpp_Runner.useLinkTimeOptimization": false, + "C_Cpp_Runner.msvcSecureNoWarnings": false, + "files.associations": { + "iostream": "cpp" + } +} \ No newline at end of file diff --git a/main.py b/main.py index b0d7f97..d60ac46 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ from flask import Flask, render_template, request, make_response, session from flask_session import Session import threading -from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications +from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications, openaire from sources import resodate, oersi, ieee, eudat, openaire_products from sources import dblp_researchers from sources import cordis, gesis, orcid, gepris, eulg, re3data, orkg @@ -76,7 +76,7 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the 
exact module name # ensure the main method which execute the search is named "search" in the module sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, - eudat, openaire_products, dblp_researchers, re3data, orkg] + eudat, openaire_products, dblp_researchers, re3data, orkg, gesis, eulg, openaire] # sources = [dblp_researchers] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) @@ -93,7 +93,7 @@ def search_results(): # sort all the results in each category results["publications"] = utils.sort_search_results(search_term, results["publications"]) results["researchers"] = utils.sort_search_results(search_term, results["researchers"]) - + results["resources"] = utils.sort_search_results(search_term, results["resources"]) #store the search results in the session session['search-results'] = copy.deepcopy(results) @@ -169,6 +169,21 @@ def load_more_researchers(): session['displayed_search_results']['researchers'] = displayed_search_results_researchers+number_of_records_to_append_on_lazy_load return render_template('components/researchers.html', results=results) +@app.route('/load-more-resources', methods=['GET']) +def load_more_resources(): + print('load more resources') + + #define a new results dict for resources to take new resources from the search results stored in the session + results = {} + results['resources'] = session['search-results']['resources'] + + total_search_results_resources = session['total_search_results']['resources'] + displayed_search_results_resources = session['displayed_search_results']['resources'] + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) + results['resources'] = results['resources'][displayed_search_results_resources:displayed_search_results_resources+number_of_records_to_append_on_lazy_load] + session['displayed_search_results']['resources'] = displayed_search_results_resources+number_of_records_to_append_on_lazy_load + return render_template('components/resources.html', results=results) + @app.route('/are-embeddings-generated', methods=['GET']) def are_embeddings_generated(): print('are_embeddings_generated') diff --git a/objects.py b/objects.py index 11437f0..288c85e 100644 --- a/objects.py +++ b/objects.py @@ -73,6 +73,8 @@ class CreativeWork(thing): author: List[Union[Organization, Person]] = field(default_factory=list) citation: str = "" # this should actually reference to articles countryOfOrigin: str = "" + conditionsOfAccess: str = "" + contributor: List[Union[Organization, Person]] = field(default_factory=list) creativeWorkStatus: str = "" dateCreated: str = "" dateModified: str = "" @@ -93,7 +95,8 @@ class CreativeWork(thing): text: str = "" thumbnail: str = "" #ImageObject thumbnailUrl: str = "" #url - version: str = "" + version: str = "" + @dataclass class Article(CreativeWork): articleBody: str = "" diff --git a/requirements.txt b/requirements.txt index 6c339c3..315ff0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ # gradio~=3.9.1 +lxml==5.1.0 flask==2.3.2 extruct~=0.14.0 rdflib~=6.2.0 diff --git a/sources/eudat.py b/sources/eudat.py index 853ad1a..3536892 100644 --- a/sources/eudat.py +++ b/sources/eudat.py @@ -94,7 +94,7 @@ def search(search_term: str, results): _source.identifier = hit.get("id", "") # _source.url = hit.get('links', {}).get('self', '') # this gives json response _source.url = source_url_direct_access + _source.identifier - 
digitalObj.source.append(_source) + digitalObj.source = "EUDAT" if resource_type in ['DATASET', 'MODEL', 'AUDIOVISUAL']: results['resources'].append(digitalObj) diff --git a/sources/gesis.py b/sources/gesis.py index 22bb5e6..c585043 100644 --- a/sources/gesis.py +++ b/sources/gesis.py @@ -54,14 +54,12 @@ def search(search_term, results): resources.datePublished = date_published # publisher = dc_fields['publisher']['all'][0] if 'publisher' in dc_fields and 'all' in dc_fields['publisher'] else None # resources.publisher=publisher + + rights = dc_fields['rights']['all'][0] if 'rights' in dc_fields and 'all' in dc_fields['rights'] else None + resources.license = rights - # rights = dc_fields['rights']['all'][0] if 'rights' in dc_fields and 'all' in dc_fields['rights'] else None - # resources.license = rights - - languages = dc_fields['language']['all'] if 'language' in dc_fields and 'all' in dc_fields['language'] else '' - if languages: - for language in languages: - resources.inLanguage.append(language) + languages = dc_fields.get('language', {}).get('all', []) + resources.inLanguage.extend(languages) id = hit['_id'] id = id.replace('.', '-') diff --git a/sources/zenodo.py b/sources/zenodo.py index 591f3e4..b8b168f 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -53,7 +53,7 @@ def search(search_term, results): digitalObj.identifier = hit.get('doi', '') digitalObj.name = hit.get('title', '') digitalObj.url = hit.get('links', {}).get('self', '') - + digitalObj.genre = resource_type digitalObj.description = utils.remove_html_tags(metadata.get('description', '')) keywords = metadata.get('keywords', []) @@ -63,11 +63,14 @@ def search(search_term, results): language = metadata.get('language', '') digitalObj.inLanguage.append(language) - + digitalObj.dateCreated = hit.get('created','') + digitalObj.dateModified = hit.get('modified','') digitalObj.datePublished = metadata.get('publication_date', '') - digitalObj.license = metadata.get('license', {}).get('id', '') - + digitalObj.license = metadata.get('license', {}).get('id', '') + digitalObj.creativeWorkStatus = hit.get('status','') + #views, # resource type + digitalObj.conditionsOfAccess = metadata.get('access-rights','') authors = metadata.get("creators", []) for author in authors: @@ -78,15 +81,30 @@ def search(search_term, results): _author.affiliation = author.get("affiliation", "") digitalObj.author.append(_author) + contributors = metadata.get("contributors", []) + for contributor in contributors: + _contributor = Author() + _contributor.type = 'Person' + _contributor.name = contributor.get("name", "") + _contributor.identifier = contributor.get("orcid", "") + _contributor.affiliation = contributor.get("affiliation", "") + digitalObj.contributor.append(_contributor) + _source = thing() _source.name = source _source.identifier = hit.get("id", "") _source.url = hit.get('links', {}).get('self_html', '') - digitalObj.source.append(_source) + digitalObj.source = _source + if resource_type.upper() == 'PUBLICATION': digitalObj.abstract = digitalObj.description - + a, b = hit.get("journal", "").get('pages','').split('-') + digitalObj.pageStart = a + digitalObj.pageEnd = b + digitalObj.pagination = hit.get("journal", "").get('pages','') + + ############################# files = hit.get('files', []) for file in files: if file.get("key", "").endswith(".pdf"): diff --git a/templates/components/resources.html b/templates/components/resources.html index d3142dc..346c470 100644 --- a/templates/components/resources.html +++ 
b/templates/components/resources.html @@ -107,3 +107,18 @@
{% endfor %} + +
+
+ {% if session.displayed_search_results.resources + < session.total_search_results.resources %}
Displaying top {{ + session.displayed_search_results.resources }} resources out of + {{ session.total_search_results.resources }} +
+
+ + {% endif %} +
+
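The hunk above renders the "load more" control; it is driven by the session-backed slicing added to main.py for /load-more-resources. A minimal standalone sketch of that pagination step — next_lazy_load_batch is a hypothetical helper name, not part of the patch:

def next_lazy_load_batch(all_results: list, displayed: int, batch_size: int):
    """Return the next slice to render plus the updated displayed counter."""
    batch = all_results[displayed:displayed + batch_size]
    return batch, displayed + len(batch)

Note that the patch itself advances session['displayed_search_results']['resources'] by the full configured batch size rather than by len(batch); the {% if displayed < total %} guard above is what hides the button once everything has been shown.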
diff --git a/templates/results.html b/templates/results.html index 5339628..64a4273 100644 --- a/templates/results.html +++ b/templates/results.html @@ -357,6 +357,12 @@ load_more_researchers() }); + $('body').on('click', "#btn_load_more_resources", function (e) { + console.log('load more resources button clicked') + load_more_resources() + }); + + $(window).scroll(function () { @@ -392,6 +398,22 @@ } } + if ($('.nav-item .active').attr('id') == 'resources-tab') { + // Use the flag in the condition (so if sent and not yet received == false) + if ($('#btn_load_more_resources').is(":visible") && //load more button is visible + !ajax_request_sent && //no other ajax request is currently being processed + $(window).scrollTop() >= $(document).height() - $(window).height() - 500) { // scroll is about to reach the bottom of the page + + // Set the flag to prevent any concurring request + ajax_request_sent = true + + // ajax call get data from server and append to the div + console.log('load more via ajax now') + load_more_resources(); + } + } + + }); @@ -495,6 +517,35 @@ }); } + function load_more_resources() { + + $('#div_load_more_resources').remove() + jQuery.ajax({ + url: '/load-more-resources', + type: "GET", + beforeSend: function () { + $('#resources').append("
"); + }, + complete: function () { + $('.loader').remove(); + }, + // data: { + // action: "pj_load_more", + // pjCount: pjCount + // }, + success: function (data) { + $('#resources').append(data); + console.log('more resources loaded.') + + // Unset the flag + ajax_request_sent = false; + }, + error: function (err) { + console.log(err); + } + }); + } + From ba5fd289011b2b3964f9fa4a253855b578644643 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Sat, 15 Jun 2024 20:25:01 +0530 Subject: [PATCH 2/9] made some minor changes --- main.py | 8 +- objects.py | 35 +++++- sources/zenodo.py | 103 ++++++++++++++--- templates/components/resources.html | 50 ++++----- zen.py | 167 ++++++++++++++++++++++++++++ 5 files changed, 312 insertions(+), 51 deletions(-) create mode 100644 zen.py diff --git a/main.py b/main.py index d60ac46..af35b9c 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ import os import uuid # from objects import Person, Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Institute, Funder, Publisher, Gesis, Cordis, Orcid, Gepris -from objects import Article, Organization, Person, Dataset, Project +from objects import Article, Organization, Person, Dataset, Project, CreativeWork, Statistics,SoftwareApplication, LearningResource, Dataset, Article from flask import Flask, render_template, request, make_response, session from flask_session import Session import threading @@ -75,9 +75,9 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module - sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, - eudat, openaire_products, dblp_researchers, re3data, orkg, gesis, eulg, openaire] - # sources = [dblp_researchers] + # sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, + # eudat, openaire_products, dblp_researchers, re3data, orkg, gesis, eulg, openaire] + sources = [zenodo] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() diff --git a/objects.py b/objects.py index 288c85e..e8c80e0 100644 --- a/objects.py +++ b/objects.py @@ -1,4 +1,4 @@ -from typing import Union, List +from typing import Union, List, Dict import dataclasses from dataclasses import dataclass, fields, field @dataclass @@ -66,6 +66,17 @@ class Author(Person): works_count: str = "" cited_by_count: str = "" +@dataclass +class Statistics(thing): + downloads: str = "" + unique_downloads: str = "" + views: str = "" + unique_views: str = "" + version_downloads: str = "" + version_unique_downloads: str = "" + version_unique_views: str = "" + version_views: str = "" + @dataclass class CreativeWork(thing): abstract: str = "" @@ -79,7 +90,7 @@ class CreativeWork(thing): dateCreated: str = "" dateModified: str = "" datePublished: str = "" - encoding_contentUrl: str = "" + encoding_contentUrl: Dict[str, str] = field(default_factory=dict) encodingFormat: str = "" funder: Union[Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 funding: str = "" # we can change this to Grant @@ -95,8 +106,20 @@ class CreativeWork(thing): text: str = "" thumbnail: str = "" #ImageObject thumbnailUrl: str = "" #url - version: str = "" - + version: str = "" + stats: Statistics = None + cites: List[Union[str, str]] = field(default_factory=list) + isPartOf: List[Union[str, str]] = 
field(default_factory=list) + isSupplementTo : List[Union[str, str]] = field(default_factory=list) + isSourceOf : List[Union[str, str]] = field(default_factory=list) + isCitedBy : List[Union[str, str]] = field(default_factory=list) + hasPart: List[Union[str, str]] = field(default_factory=list) + isSupplementedBy: List[Union[str, str]] = field(default_factory=list) + isPreviousVersionOf: List[Union[str, str]] = field(default_factory=list) + isDerivedFrom: List[Union[str, str]] = field(default_factory=list) + documents: List[Union[str, str]] = field(default_factory=list) + + @dataclass class Article(CreativeWork): articleBody: str = "" @@ -104,6 +127,9 @@ class Article(CreativeWork): pageStart: str = "" pagination: str = "" wordCount: str = "" + issue: str = "" + Journal: str = "" + JournalVolume: str = "" @dataclass class Dataset(CreativeWork): @@ -129,6 +155,7 @@ class Project(Organization): class SoftwareApplication(CreativeWork): distribution: str = "" issn: str = "" + softwareVersion: str = "" @dataclass class LearningResource(CreativeWork): assesses: str = "" #The item being described is intended to assess the competency or learning outcome defined by the referenced term. diff --git a/sources/zenodo.py b/sources/zenodo.py index b8b168f..4d670fb 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -1,7 +1,7 @@ import requests import utils # from objects import Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Person, LearningResource, CreativeWork, VideoObject, ImageObject -from objects import thing, Article, Author, CreativeWork, Dataset, SoftwareApplication, VideoObject, ImageObject, LearningResource +from objects import thing, Article, Statistics, Author, CreativeWork, Dataset, SoftwareApplication, VideoObject, ImageObject, LearningResource import logging from sources import data_retriever import traceback @@ -64,13 +64,54 @@ def search(search_term, results): language = metadata.get('language', '') digitalObj.inLanguage.append(language) digitalObj.dateCreated = hit.get('created','') - digitalObj.dateModified = hit.get('modified','') + digitalObj.dateModified = hit.get('modified','') digitalObj.datePublished = metadata.get('publication_date', '') - digitalObj.license = metadata.get('license', {}).get('id', '') - digitalObj.creativeWorkStatus = hit.get('status','') - - #views, # resource type + digitalObj.license = metadata.get('license', {}).get('id', '') + digitalObj.creativeWorkStatus = hit.get('status','') + digitalObj.funder = metadata.get('grants', [{}])[0].get('funder', {}).get('name', '') digitalObj.conditionsOfAccess = metadata.get('access-rights','') + if(digitalObj.conditionsOfAccess == ''): + digitalObj.conditionsOfAccess = metadata.get('access_right','') + + relation_map = { + 'iscitedby': 'isCitedBy', + 'issupplementto': 'isSupplementTo', + 'ispartof': 'isPartOf', + 'cites': 'cites', + 'issourceof': 'isSourceOf', + 'isderivedfrom': 'isDerivedFrom', + 'issupplementedby': 'isSupplementedBy', + 'ispreviousversionof': 'isPreviousVersionOf', + 'documents': 'documents', + 'haspart': 'hasPart' + } + + related_identifiers = metadata.get('related_identifiers', []) + + for related_identifier in related_identifiers: + relation = related_identifier.get('relation', '').lower() + identifier = related_identifier.get('identifier', '') + + if relation == 'iscitedby': + digitalObj.isCitedBy.append(identifier) + elif relation == 'issupplementto': + digitalObj.isSupplementTo.append(identifier) + elif relation == 'ispartof': + 
digitalObj.isPartOf.append(identifier) + elif relation == 'cites': + digitalObj.cites.append(identifier) + elif relation == 'issourceof': + digitalObj.isSourceOf.append(identifier) + elif relation == 'isderivedfrom': + digitalObj.isDerivedFrom.append(identifier) + elif relation == 'issupplementedby': + digitalObj.isSupplementedBy.append(identifier) + elif relation == 'ispreviousversionof': + digitalObj.isPreviousVersionOf.append(identifier) + elif relation == 'documents': + digitalObj.documents.append(identifier) + elif relation == 'haspart': + digitalObj.hasPart.append(identifier) authors = metadata.get("creators", []) for author in authors: @@ -81,6 +122,20 @@ def search(search_term, results): _author.affiliation = author.get("affiliation", "") digitalObj.author.append(_author) + Stats = hit.get('stats', '') + _stats = Statistics() + + _stats.downloads = Stats.get("downloads", '') + _stats.unique_downloads = Stats.get("unique_downloads", '') + _stats.views = Stats.get("views", '') + _stats.unique_views = Stats.get("unique_views", '') + _stats.version_downloads = Stats.get("version_downloads", '') + _stats.version_unique_downloads = Stats.get("version_unique_downloads", '') + _stats.version_unique_views = Stats.get("version_unique_views", '') + _stats.version_views = Stats.get("version_views", '') + + digitalObj.stats = _stats + contributors = metadata.get("contributors", []) for contributor in contributors: _contributor = Author() @@ -94,22 +149,34 @@ def search(search_term, results): _source.name = source _source.identifier = hit.get("id", "") _source.url = hit.get('links', {}).get('self_html', '') - digitalObj.source = _source - + digitalObj.source.append(_source) + + files = hit.get('files', []) + + # if resource_type == "LESSON": + for file in files: + file_key = file.get("key", "") + digitalObj.encoding_contentUrl[file_key] = file.get("links", {}).get("self", "") + digitalObj.softwareVersion = metadata.get("version", "") if resource_type.upper() == 'PUBLICATION': digitalObj.abstract = digitalObj.description - a, b = hit.get("journal", "").get('pages','').split('-') - digitalObj.pageStart = a - digitalObj.pageEnd = b - digitalObj.pagination = hit.get("journal", "").get('pages','') - - ############################# - files = hit.get('files', []) - for file in files: - if file.get("key", "").endswith(".pdf"): - digitalObj.encoding_contentUrl = file.get("links", {}).get("self", "") + pages = hit.get("journal", {}).get('pages', '') + if '-' in pages: + a, b = pages.split('-') + digitalObj.pageStart = a.strip() + digitalObj.pageEnd = b.strip() + else: + digitalObj.pageStart = pages + digitalObj.pageEnd = '' + + digitalObj.pagination = pages + journal_info = metadata.get('journal', {}) + digitalObj.Journal = journal_info.get('title', '') + digitalObj.JournalVolume = journal_info.get('volume', '') + digitalObj.issue = journal_info.get('issue', '') + results['publications'].append(digitalObj) elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']: results['resources'].append(digitalObj) diff --git a/templates/components/resources.html b/templates/components/resources.html index 346c470..3fd0d68 100644 --- a/templates/components/resources.html +++ b/templates/components/resources.html @@ -20,12 +20,13 @@
- {% for author in resources.author %} + {% for author in resources.author %} {% if author.type == 'Person' %} {{author.name}} + tabindex="-1" role="button" aria-disabled="true"> + {{author.name}} + {% endif %} {% endfor %}
@@ -37,34 +38,33 @@
- {{resources.source}} - {% for language in resources.inLanguage %} - {{language|upper}} + {% for source in resources.source %} + {{ source.name }} + {% for language in source.inLanguage %} + {{ language|upper }} + {% endfor %} {% endfor %} + {{ resources.license }} {{ resources.encodingFormat|upper }} - {{resources.license}} -
- {% if resources.source == 'GESIS' or resources.source == 'Zenodo' %} -
- {{resources.datePublished}} -
- {% endif %} - {% if resources.source == 'GEPRIS' %} -
- {{resources.dateLastModified}} -
- {% endif %} - {% if resources.source == 'CODALAB' %} -
- {{resources.dateCreated}} +
+ {% for keyword in resources.keywords %} + {{ keyword }} + {% endfor %}
- {% endif %} - {% if resources.source == 'elg:corpus' or resources.source == 'elg:software/service'%} +
- {{resources.datePublished}} + {% if resources.source == 'GESIS' or resources.source[0].name == 'Zenodo' %} + {{ resources.datePublished }} + {% elif resources.source == 'GEPRIS' %} + {{ resources.dateLastModified }} + {% elif resources.source == 'CODALAB' %} + {{ resources.dateCreated }} + {% elif resources.source == 'elg:corpus' or resources.source == 'elg:software/service' %} + {{ resources.datePublished }} + {% endif %}
- {% endif %}
+
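sources/zenodo.py in this patch defines a relation_map but still dispatches through a long if/elif chain; the zen.py scratch file added below keeps the equivalent table-driven dispatch in comments. A self-contained sketch of that variant, assuming digital_obj exposes list attributes named by the map (as the patched CreativeWork dataclass does):

def map_related_identifiers(digital_obj, metadata: dict) -> None:
    """Append each Zenodo related identifier to the matching list attribute."""
    relation_map = {
        'iscitedby': 'isCitedBy', 'issupplementto': 'isSupplementTo',
        'ispartof': 'isPartOf', 'cites': 'cites',
        'issourceof': 'isSourceOf', 'isderivedfrom': 'isDerivedFrom',
        'issupplementedby': 'isSupplementedBy',
        'ispreviousversionof': 'isPreviousVersionOf',
        'documents': 'documents', 'haspart': 'hasPart',
    }
    for rel in metadata.get('related_identifiers', []):
        relation = rel.get('relation', '').lower()
        identifier = rel.get('identifier', '')
        if relation in relation_map:
            # e.g. digital_obj.isCitedBy.append(identifier), with no branch per relation
            getattr(digital_obj, relation_map[relation]).append(identifier)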
diff --git a/zen.py b/zen.py new file mode 100644 index 0000000..3c2853e --- /dev/null +++ b/zen.py @@ -0,0 +1,167 @@ +import requests +import utils +# from objects import Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Person, LearningResource, CreativeWork, VideoObject, ImageObject +from objects import thing, Article, Author, CreativeWork, Dataset, SoftwareApplication, VideoObject, ImageObject, LearningResource, Statistics +import logging +from sources import data_retriever +import traceback + +# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_term, results): + + source = "Zenodo" + try: + search_result = data_retriever.retrieve_data(source=source, + base_url=utils.config["search_url_zenodo"], + search_term=search_term, + results=results) + + total_records_found = search_result.get("hits", {}).get("total", 0) + hits = search_result.get("hits", {}).get("hits", []) + total_hits = len(hits) + logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + + if int(total_hits) > 0: + for hit in hits: + + metadata = hit.get('metadata', {}) + resource_type = metadata.get('resource_type', {}).get('type','OTHER').upper() + + if resource_type == 'PUBLICATION': + digitalObj = Article() + elif resource_type in ['PRESENTATION', 'POSTER']: + digitalObj = CreativeWork() + elif resource_type == 'DATASET': + digitalObj = Dataset() + elif resource_type == 'VIDEO': + digitalObj = VideoObject() + elif resource_type == 'IMAGE': + digitalObj = ImageObject() + elif resource_type == 'LESSON': + digitalObj = LearningResource() + elif resource_type == 'SOFTWARE': + digitalObj = SoftwareApplication() + elif resource_type == 'OTHER': + digitalObj = CreativeWork() + else: + print('This resource type is still not defined:', resource_type) + digitalObj = CreativeWork() + + digitalObj.identifier = hit.get('doi', '') + digitalObj.name = hit.get('title', '') + digitalObj.url = hit.get('links', {}).get('self', '') + digitalObj.genre = resource_type + digitalObj.description = utils.remove_html_tags(metadata.get('description', '')) + + keywords = metadata.get('keywords', []) + if isinstance(keywords, list): + # for keyword in keywords: + # digitalObj.keywords.append(keyword) + for keyword in keywords: + terms = [term.strip() for term in keyword.split(",")] + digitalObj.keywords.extend(terms) + + language = metadata.get('language', '') + digitalObj.inLanguage.append(language) + digitalObj.dateCreated = hit.get('created','') + digitalObj.dateModified = hit.get('modified','') + digitalObj.datePublished = metadata.get('publication_date', '') + digitalObj.license = metadata.get('license', {}).get('id', '') + digitalObj.creativeWorkStatus = hit.get('status','') + digitalObj.funder = metadata.get('grants', {}).get('funder', '').get('name','') + + #views, # resource type + digitalObj.conditionsOfAccess = metadata.get('access-rights','') + if(digitalObj.conditionsOfAccess == ''): + digitalObj.conditionsOfAccess = metadata.get('access_right','') + + authors = metadata.get("creators", []) + for author in authors: + _author = Author() + _author.type = 'Person' + _author.name = author.get("name", "") + _author.identifier = author.get("orcid", "") + _author.affiliation = author.get("affiliation", "") + digitalObj.author.append(_author) + + Stats = hit.get('stats', '') + _stats = Statistics() + + _stats.downloads = Stats.get("downloads", '') + 
_stats.unique_downloads = Stats.get("unique_downloads", '') + _stats.views = Stats.get("views", '') + _stats.unique_views = Stats.get("unique_views", '') + _stats.version_downloads = Stats.get("version_downloads", '') + _stats.version_unique_downloads = Stats.get("version_unique_downloads", '') + _stats.version_unique_views = Stats.get("version_unique_views", '') + _stats.version_views = Stats.get("version_views", '') + + digitalObj.stats = _stats + + # relation = metadata.get('related_identifiers', '').get('relation', '').lower() + # identifier = metadata.get('related_identifiers', '').get('identifier','').lower() + # relation_map = { + # 'iscitedby': 'isCitedBy', + # 'issupplementto': 'isSupplementTo', + # 'ispartof': 'isPartOf', + # 'cites': 'cites', + # 'issourceof': 'isSourceOf', + # 'isderivedfrom': 'isDerivedFrom', + # 'issupplementedby': 'isSupplementedBy', + # 'ispreviousversionof': 'isPreviousVersionOf', + # 'documents': 'documents', + # 'haspart': 'hasPart' + # } + # if relation in relation_map: + # getattr(digitalObj, relation_map[relation]).append(identifier) + + contributors = metadata.get("contributors", []) + for contributor in contributors: + _contributor = Author() + _contributor.type = 'Person' + _contributor.name = contributor.get("name", "") + _contributor.identifier = contributor.get("orcid", "") + _contributor.affiliation = contributor.get("affiliation", "") + digitalObj.contributor.append(_contributor) + + _source = thing() + _source.name = source + _source.identifier = hit.get("id", "") + _source.url = hit.get('links', {}).get('self_html', '') + digitalObj.source.append(_source) + + files = hit.get('files', []) + + # if resource_type == "LESSON": + for file in files: + file_key = file.get("key", "") + digitalObj.encoding_contentUrl[file_key] = file.get("links", {}).get("self", "") + + digitalObj.softwareVersion = metadata.get("version", "") + if resource_type.upper() == 'PUBLICATION': + digitalObj.abstract = digitalObj.description + a, b = hit.get("journal", "").get('pages','').split('-') + digitalObj.pageStart = a + digitalObj.pageEnd = b + digitalObj.pagination = hit.get("journal", "").get('pages','') + digitalObj.Jounral = metadata.get('journal').get('title', '') + digitalObj.JournalVolume = metadata.get('journal').get('volume', '') + digitalObj.issue = metadata.get('journal').get('issue', '') + ############################# + + results['publications'].append(digitalObj) + elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']: + results['resources'].append(digitalObj) + else: + results['others'].append(digitalObj) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append(source) + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + logger.error(traceback.format_exc()) \ No newline at end of file From 8369d8495a14cc8fe6e22907bdea37f6e27209ba Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Thu, 27 Jun 2024 12:46:10 +0530 Subject: [PATCH 3/9] minor changes --- main.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index af35b9c..2b02148 100644 --- a/main.py +++ b/main.py @@ -75,9 +75,9 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module - # sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, 
oersi, ieee, - # eudat, openaire_products, dblp_researchers, re3data, orkg, gesis, eulg, openaire] - sources = [zenodo] + sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, + eudat, openaire_products, dblp_researchers, re3data, orkg, gesis, eulg, openaire] + # sources = [zenodo] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() @@ -89,7 +89,6 @@ def search_results(): # deduplicator.convert_publications_to_csv(results["publications"]) # results["publications"] = deduplicator.perform_entity_resolution_publications(results["publications"]) - # sort all the results in each category results["publications"] = utils.sort_search_results(search_term, results["publications"]) results["researchers"] = utils.sort_search_results(search_term, results["researchers"]) From 1245b9af5a632873c5ab7fea43f831fd52caabc8 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Tue, 9 Jul 2024 10:50:17 +0530 Subject: [PATCH 4/9] resources details page completed --- main.py | 24 ++- sources/eulg.py | 7 + sources/ieee.py | 2 +- sources/zenodo.py | 129 +++++++++----- templates/components/resources.html | 55 ++++-- templates/resource-details.html | 250 +++++++++++++++++++--------- 6 files changed, 320 insertions(+), 147 deletions(-) diff --git a/main.py b/main.py index 07c24cc..554f42f 100644 --- a/main.py +++ b/main.py @@ -8,7 +8,6 @@ from flask_session import Session import threading from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications, wikidata_researchers -from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications, openaire from sources import resodate, oersi, ieee, eudat, openaire_products from sources import dblp_researchers from sources import crossref, semanticscholar @@ -78,10 +77,8 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, - eudat, openaire_products, dblp_researchers, re3data, orkg, gesis, eulg] - # sources = [zenodo] + eudat, dblp_researchers, re3data, orkg, gesis, eulg] - # sources = [openalex_publications] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() @@ -311,17 +308,18 @@ def publication_details_citations(doi): print("response:", response) return response -@app.route('/resource-details') -def resource_details(): - response = make_response(render_template('resource-details.html')) +@app.route('/resource-details/', methods=['GET']) +def resource_details(sources): - # Set search-session cookie to the session cookie value of the first visit - if request.cookies.get('search-session') is None: - if request.cookies.get('session') is None: - response.set_cookie('search-session', str(uuid.uuid4())) - else: - response.set_cookie('search-session', request.cookies['session']) + sources = unquote(sources) + sources = ast.literal_eval(sources) + for source in sources: + doi = source['doi'] + + resource = zenodo.get_resource(doi="https://doi.org/"+doi) + response = make_response(render_template('resource-details.html', resource=resource)) + print("response:", response) return response diff --git a/sources/eulg.py b/sources/eulg.py index 7a397b2..edc4669 100644 --- a/sources/eulg.py +++ b/sources/eulg.py @@ -66,7 +66,11 @@ def 
_get(self, path: str, queries: List[set] = [], json: bool = False): dataset.name = result.resource_name dataset.url = url dataset.datePublished = str(result.creation_date) + dataset.dateModified = str(result.last_date_updated) dataset.description = description + dataset.version = result.version + dataset.encoding_contentUrl = result.detail + # dataset.conditionsOfAccess = result.condition_of_use[0] keywords = result.keywords if isinstance(keywords, list): for keyword in keywords: @@ -115,6 +119,9 @@ def _get(self, path: str, queries: List[set] = [], json: bool = False): software.name = result.resource_name software.url = url software.description = description + # software.version = result.version + software.encoding_contentUrl = result.detail + software.conditionsOfAccess - result.condition_of_use[0] software.datePublished = str(result.creation_date) software.countryOfOrigin = result.country_of_registration keywords = result.keywords diff --git a/sources/ieee.py b/sources/ieee.py index d0dc87f..2b5667a 100644 --- a/sources/ieee.py +++ b/sources/ieee.py @@ -65,7 +65,7 @@ def search(search_term, results): _source.name = source _source.identifier = hit.get("article_number", "") _source.url = hit.get("html_url", "") - publication.source.append(_source) + publication.source.append(_source) results['publications'].append(publication) diff --git a/sources/zenodo.py b/sources/zenodo.py index 4d670fb..a83974c 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -59,13 +59,14 @@ def search(search_term, results): keywords = metadata.get('keywords', []) if isinstance(keywords, list): for keyword in keywords: - digitalObj.keywords.append(keyword) + terms = [term.strip() for term in keyword.split(",")] + digitalObj.keywords.extend(terms) language = metadata.get('language', '') digitalObj.inLanguage.append(language) digitalObj.dateCreated = hit.get('created','') digitalObj.dateModified = hit.get('modified','') - digitalObj.datePublished = metadata.get('publication_date', '') + digitalObj.datePublished = metadata.get('resource_date', '') digitalObj.license = metadata.get('license', {}).get('id', '') digitalObj.creativeWorkStatus = hit.get('status','') digitalObj.funder = metadata.get('grants', [{}])[0].get('funder', {}).get('name', '') @@ -73,45 +74,45 @@ def search(search_term, results): if(digitalObj.conditionsOfAccess == ''): digitalObj.conditionsOfAccess = metadata.get('access_right','') - relation_map = { - 'iscitedby': 'isCitedBy', - 'issupplementto': 'isSupplementTo', - 'ispartof': 'isPartOf', - 'cites': 'cites', - 'issourceof': 'isSourceOf', - 'isderivedfrom': 'isDerivedFrom', - 'issupplementedby': 'isSupplementedBy', - 'ispreviousversionof': 'isPreviousVersionOf', - 'documents': 'documents', - 'haspart': 'hasPart' - } - - related_identifiers = metadata.get('related_identifiers', []) - - for related_identifier in related_identifiers: - relation = related_identifier.get('relation', '').lower() - identifier = related_identifier.get('identifier', '') + # relation_map = { + # 'iscitedby': 'isCitedBy', + # 'issupplementto': 'isSupplementTo', + # 'ispartof': 'isPartOf', + # 'cites': 'cites', + # 'issourceof': 'isSourceOf', + # 'isderivedfrom': 'isDerivedFrom', + # 'issupplementedby': 'isSupplementedBy', + # 'ispreviousversionof': 'isPreviousVersionOf', + # 'documents': 'documents', + # 'haspart': 'hasPart' + # } + + # related_identifiers = metadata.get('related_identifiers', []) + + # for related_identifier in related_identifiers: + # relation = related_identifier.get('relation', '').lower() + # 
identifier = related_identifier.get('identifier', '') - if relation == 'iscitedby': - digitalObj.isCitedBy.append(identifier) - elif relation == 'issupplementto': - digitalObj.isSupplementTo.append(identifier) - elif relation == 'ispartof': - digitalObj.isPartOf.append(identifier) - elif relation == 'cites': - digitalObj.cites.append(identifier) - elif relation == 'issourceof': - digitalObj.isSourceOf.append(identifier) - elif relation == 'isderivedfrom': - digitalObj.isDerivedFrom.append(identifier) - elif relation == 'issupplementedby': - digitalObj.isSupplementedBy.append(identifier) - elif relation == 'ispreviousversionof': - digitalObj.isPreviousVersionOf.append(identifier) - elif relation == 'documents': - digitalObj.documents.append(identifier) - elif relation == 'haspart': - digitalObj.hasPart.append(identifier) + # if relation == 'iscitedby': + # digitalObj.isCitedBy.append(identifier) + # elif relation == 'issupplementto': + # digitalObj.isSupplementTo.append(identifier) + # elif relation == 'ispartof': + # digitalObj.isPartOf.append(identifier) + # elif relation == 'cites': + # digitalObj.cites.append(identifier) + # elif relation == 'issourceof': + # digitalObj.isSourceOf.append(identifier) + # elif relation == 'isderivedfrom': + # digitalObj.isDerivedFrom.append(identifier) + # elif relation == 'issupplementedby': + # digitalObj.isSupplementedBy.append(identifier) + # elif relation == 'ispreviousversionof': + # digitalObj.isPreviousVersionOf.append(identifier) + # elif relation == 'documents': + # digitalObj.documents.append(identifier) + # elif relation == 'haspart': + # digitalObj.hasPart.append(identifier) authors = metadata.get("creators", []) for author in authors: @@ -177,7 +178,7 @@ def search(search_term, results): digitalObj.JournalVolume = journal_info.get('volume', '') digitalObj.issue = journal_info.get('issue', '') - results['publications'].append(digitalObj) + results['publication'].append(digitalObj) elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']: results['resources'].append(digitalObj) else: @@ -187,6 +188,52 @@ def search(search_term, results): logger.error(f'Timed out Exception: {str(ex)}') results['timedout_sources'].append(source) + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + logger.error(traceback.format_exc()) + + +@utils.timeit +def get_resource(doi: str): + + source = "Zenodo" + + try: + search_result = data_retriever.retrieve_single_object(source=source, + base_url=utils.config["search_url_zenodo"], + doi=doi) + + metadata = search_result.get('metadata', {}) + resource = CreativeWork() + resource.name = search_result.get("title", "") + resource.url = search_result.get('links', {}).get('self', '') + resource.identifier = search_result.get("doi", "") + resource.datePublished = metadata.get("publication_date", "") + resource.inLanguage.append(metadata.get("language", "")) + resource.license = metadata.get("license", "") + + resource.description = utils.remove_html_tags(metadata.get("description", "")) + resource.abstract = resource.description + authors = search_result.get("creators", []) + for author in authors: + _author = Author() + _author.type = 'Person' + _author.name = author.get("name", "") + _author.identifier = author.get("orcid", "") + _author.affiliation = author.get("affiliation", "") + resource.author.append(_author) + + keywords = metadata.get('keywords', []) + if isinstance(keywords, list): + for keyword in keywords: + terms = [term.strip() for term in 
keyword.split(",")] + resource.keywords.extend(terms) + + return resource + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + except Exception as ex: logger.error(f'Exception: {str(ex)}') logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/templates/components/resources.html b/templates/components/resources.html index 3fd0d68..9523f75 100644 --- a/templates/components/resources.html +++ b/templates/components/resources.html @@ -13,21 +13,28 @@
-
- - {{resources.name}} +
+ + {{ resources.name }} + + DOI: {{ resources.identifier }}
- {% for author in resources.author %} + {% set author_count = namespace(value=0) %} + {% for author in resources.author %} + {% if author_count.value == 5 %} + and {{ (resources.author|count) - 5 }} + more + {% endif %} {% if author.type == 'Person' %} - {{author.name}} - + class="{% if author_count.value > 4 %}d-none{% endif %} btn btn-outline-dark text-dark rounded-pill border-1 p-1 pe-2 mb-1" + tabindex="-1" role="button" aria-disabled="true">{{author.name}} {% endif %} + {% set author_count.value = author_count.value + 1 %} {% endfor %}
@@ -38,17 +45,33 @@
- {% for source in resources.source %} - {{ source.name }} - {% for language in source.inLanguage %} - {{ language|upper }} + {% set found = false %} + + {% if resources.source.name %} + + {{ resources.source.name }} + + {% set found = true %} + {% endif %} + + {% if not found %} + + + {{ resources.source }} + + + {% endif %} + {% if resources.source.inLanguage %} + {% for language in resources.source.inLanguage %} + {{ language|upper }} {% endfor %} - {% endfor %} - {{ resources.license }} - {{ resources.encodingFormat|upper }} + {% endif %} + + {{ resources.license }} + {{ resources.encodingFormat|upper }}
{% for keyword in resources.keywords %} - {{ keyword }} + {{ keyword }} {% endfor %}
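The keyword pills above rely on the splitting added to sources/zenodo.py, where comma-packed Zenodo keyword strings are expanded via [term.strip() for term in keyword.split(",")]. A small standalone version of that logic — split_keywords is a hypothetical helper name, and the empty-fragment filter is an extra guard the patch does not include:

def split_keywords(raw_keywords):
    """Flatten entries like 'semantic web, linked data' into individual terms."""
    terms = []
    for raw in raw_keywords:
        # one Zenodo keyword entry may pack several comma-separated terms
        terms.extend(term.strip() for term in raw.split(",") if term.strip())
    return terms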
diff --git a/templates/resource-details.html b/templates/resource-details.html index 3035664..7a605c1 100644 --- a/templates/resource-details.html +++ b/templates/resource-details.html @@ -1,6 +1,6 @@ {% extends "layouts/base.html" %} -{% block title %} Resources Details {% endblock title %} +{% block title %} resource Details {% endblock title %} {% block stylesheets %} @@ -15,31 +15,31 @@
-

Women and Politics

+

{{resource.name}}

- Politics - Social engagement - Politicians - Demography - Social Dataset +
+ {% for keyword in resource.keywords %} + {{keyword}} + {% endfor %} +
- Study number: + Version: - ZA6719 + {{resource.version}} DOI: - 10.4232 + {{resource.identifier}} Publication Date: - 03.10.2019 + {{resource.datePublished}}
@@ -47,22 +47,22 @@

Women and Politics

Downloads
-
1K+
+
--
Saved
-
420
+
--
Cited
-
300
+
--
Views
-
278
+
--
@@ -70,70 +70,65 @@

Women and Politics

-
-
-
AUTHORS (5)
-
-
- Author 1 -
-
- Author 2 -
-
- Author 3 +
+
- Author 4 +
AUTHORS ({{resource.author|length}})
-
- Author 5 +
+ {% for author in resource.author %} + + {% endfor %}
-
-
ABSTRACT
+
+
ABSTRACT
- - The survey focused on women's interest in politics in general and in specific policy areas in particular, - their expectations of political parties and politicians, and their willingness to become politically or socially involved. - Of particular interest here was the question of the extent to which there are differences in attitudes, - experiences and expectations within the group of women. - Do women academics have other political interests than women with low formal education? - Or do the expectations that women in West Germany have of politics differ from those of women in East Germany? - It should also be ascertained whether women perceive female politicians differently from male politicians. - Are there characteristics that they attribute more to a male politician or more to a female politician? After all, - the issue of equal rights played a prominent role in the survey. - The perceived realisation of equal rights for men and women in Germany in general and in selected areas of life was measured. - - Topics: Political understanding and interest: interest in politics; particularly interesting political topics; - attitude towards elites (politics and political parties); frequency of discussions on political and social topics with various groups - (life partner or family, friends and acquaintances, in a public political event, colleagues, sports and hobby clubs, - on Facebook or in other social networks); political information behaviour (informing people who lack knowledge about a political topic, - intensive reflection on political issues, dealing with various party positions, - interest in the person in office of important political offices, own suggestions - for improvement with regard to living conditions in the place of residence); - attitude towards political styles: opinion on how parties and politicians deal with each other (should be considerate, - can be hard to deal with as long as it is content and not personal, - or occasionally personal attacks on politicians of other parties are fine); political discussion behaviour. + {{ resource.description}}
-
-
POLITICAL AND SOCIAL ENGAGEMENT
+
+
SUPPLEMENTAL MATERIAL
- +
+ + Coming soon .... + +
-
-
REFERENCES
+
+
REFERENCES
-
+
+ +
+
+
CITATIONS
+
+
+ +
+
+ -
+
-
RECOMMENDATIONS
+
RECOMMENDATIONS
+
+
+
-
+ +
FAIR Assessment
- +
+ + Coming soon .... + +
Jupyter Lab
- +
+ + Coming soon .... + +
@@ -235,10 +242,101 @@
Jupyter Lab
$(document).ready(function () { + var tooltipTriggerList = [].slice.call(document.querySelectorAll('[data-bs-toggle="tooltip"]')) + var tooltipList = tooltipTriggerList.map(function (tooltipTriggerEl) { + return new bootstrap.Tooltip(tooltipTriggerEl) + }) }); - + load_references = function () { + let doi = document.getElementById('resource_doi').innerHTML + $.ajax({ + url: '/resource-details-references/' + doi, + type: "GET", + data: { + }, + beforeSend: function () { + $('#references_block').html("
"); + }, + complete: function () { + // $('.loader').remove(); + }, + success: function (data) { + console.log(data) + $('.loader').remove(); + $('#references_block').html(data); + console.log('references have been loaded.') + }, + error: function (err) { + console.log(err); + return err + } + }); + } + + load_recommendations = function () { + let doi = document.getElementById('resource_doi').innerHTML + $.ajax({ + url: '/resource-details-recommendations/' + doi, + type: "GET", + data: { + }, + beforeSend: function () { + $('#recommendations_block').html("
"); + }, + complete: function () { + $('.loader').remove(); + }, + success: function (data) { + console.log(data) + $('#recommendations_block').html(data); + console.log('recommendations have been loaded.') + + }, + error: function (err) { + console.log(err); + return err + } + }); + } + + load_citations = function () { + let doi = document.getElementById('resource_doi').innerHTML + $.ajax({ + url: '/resource-details-citations/' + doi, + type: "GET", + data: { + }, + beforeSend: function () { + $('#citations_block').html("
"); + }, + complete: function () { + $('.loader').remove(); + }, + success: function (data) { + console.log(data) + $('#citations_block').html(data); + console.log('citations have been loaded.') + }, + error: function (err) { + console.log(err); + return err + } + }); + } + + $('#btn-load-references').click(function () { + load_references(); + }); + + $('#btn-load-recommendations').click(function () { + load_recommendations(); + }); + + $('#btn-load-citations').click(function () { + load_citations(); + }); From c921d796e2aa67b5b69a2ba89ada678cb6161c39 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Tue, 9 Jul 2024 11:23:40 +0530 Subject: [PATCH 5/9] Resources details page updated --- .github/workflows/main.yml | 7 +- config.yaml | 6 +- main.py | 149 ++++--- objects.py | 57 ++- requirements.txt | 4 +- sources/data_retriever.py | 4 +- sources/dblp_researchers.py | 2 - sources/gepris.py | 15 +- sources/openalex_researchers.py | 327 ++++++++++++-- sources/orcid.py | 35 +- sources/wikidata_researchers.py | 89 ++-- static/images/sources/semantic-scholar.png | Bin 0 -> 612 bytes templates/components/publications.html | 185 ++------ templates/components/researchers.html | 16 +- templates/partials/common/download-modal.html | 70 +++ templates/partials/common/preview-modal.html | 20 + templates/partials/common/share-modal.html | 127 ++++++ templates/researcher-details.html | 418 ++++++++++-------- templates/results.html | 38 -- utils.py | 10 + 20 files changed, 1012 insertions(+), 567 deletions(-) create mode 100644 static/images/sources/semantic-scholar.png create mode 100644 templates/partials/common/download-modal.html create mode 100644 templates/partials/common/preview-modal.html create mode 100644 templates/partials/common/share-modal.html diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 33f5797..f159ac8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: 'Deploy NFDI Search Engine on sems-kg-1' on: workflow_dispatch: push: - branches: + branches: - main jobs: @@ -18,6 +18,9 @@ jobs: - name: 'Delete old Docker image' run: docker image rm nfdi-search-engine-search-engine - name: 'Copy logging.conf' - run: cp logging.conf.example logging.conf + run: cp logging.conf.example logging.conf + - name: 'Create .env' + run: | + echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> ./.env - name: 'Re-build Docker container from current source' run: docker compose up --force-recreate --build --detach diff --git a/config.yaml b/config.yaml index e1632ee..2845f66 100644 --- a/config.yaml +++ b/config.yaml @@ -36,4 +36,8 @@ chatbot_server: https://nfdi-chatbot.nliwod.org # chatbot_server: http://127.0.0.1:5005 endpoint_chat: /chat endpoint_save_docs_with_embeddings: /save-docs-with-embeddings -endpoint_are_embeddings_generated: /are-embeddings-generated \ No newline at end of file +endpoint_are_embeddings_generated: /are-embeddings-generated + +#open ai +openai_model_version: gpt-3.5-turbo-0125 +openai_temperature: 2 \ No newline at end of file diff --git a/main.py b/main.py index 554f42f..1cff48c 100644 --- a/main.py +++ b/main.py @@ -3,12 +3,12 @@ import os import uuid # from objects import Person, Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Institute, Funder, Publisher, Gesis, Cordis, Orcid, Gepris -from objects import Article, Organization, Person, Dataset, Project, CreativeWork, Statistics,SoftwareApplication, LearningResource, Dataset, Article -from flask import Flask, render_template, request, 
make_response, session +from objects import Article, Organization, Person, Dataset, Project +from flask import Flask, render_template, request, make_response, session, requests from flask_session import Session import threading -from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications, wikidata_researchers -from sources import resodate, oersi, ieee, eudat, openaire_products +from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications, wikidata_researchers, openalex_researchers +from sources import resodate, oersi, ieee, eudat, openaire_products, openalex_publications from sources import dblp_researchers from sources import crossref, semanticscholar from sources import cordis, gesis, orcid, gepris, eulg, re3data, orkg @@ -32,8 +32,23 @@ app.config["SESSION_TYPE"] = "filesystem" Session(app) +results = { + 'publications': [], + 'researchers': [], + 'resources': [], + 'organizations': [], + 'events': [], + 'fundings': [], + 'others': [], + 'timedout_sources': [] + } + @app.route('/') def index(): + + if (utils.env_config["OPENAI_API_KEY"] == ""): + return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.')) + response = make_response(render_template('index.html')) # Set search-session cookie to the session cookie value of the first visit @@ -62,23 +77,14 @@ def search_results(): search_term = request.args.get('txtSearchTerm') session['search-term'] = search_term - results = { - 'publications': [], - 'researchers': [], - 'resources': [], - 'organizations': [], - 'events': [], - 'fundings': [], - 'others': [], - 'timedout_sources': [] - } + for k in results.keys(): results[k] = [] threads = [] # add all the sources here in this list; for simplicity we should use the exact module name - # ensure the main method which execute the search is named "search" in the module + # ensure the main method which execute the search is named "search" in the module sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, - eudat, dblp_researchers, re3data, orkg, gesis, eulg] - + eudat, openaire_products, dblp_researchers, re3data, orkg] + # sources = [openalex_publications] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() @@ -91,27 +97,26 @@ def search_results(): # deduplicator.convert_publications_to_csv(results["publications"]) # results["publications"] = deduplicator.perform_entity_resolution_publications(results["publications"]) # sort all the results in each category - results["publications"] = utils.sort_search_results(search_term, results["publications"]) - results["researchers"] = utils.sort_search_results(search_term, results["researchers"]) + results["publications"] = utils.sort_search_results(search_term, results["publications"]) + results["researchers"] = utils.sort_search_results(search_term, results["researchers"]) results["resources"] = utils.sort_search_results(search_term, results["resources"]) #store the search results in the session session['search-results'] = copy.deepcopy(results) - # Chatbot - push search results to chatbot server for embeddings generation - if utils.config['chatbot_feature_enable'] == "True": + if (utils.config['chatbot_feature_enable']): # Convert a UUID to a 32-character hexadecimal string search_uuid = uuid.uuid4().hex session['search_uuid'] = search_uuid - + def send_search_results_to_chatbot(search_uuid: str): 
print('request is about to start') - chatbot_server = utils.config['chatbot_server'] - save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings'] - request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}' + chatbot_server = utils.config['chatbot_server'] + save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings'] + request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}' response = requests.post(request_url, json=json.dumps(results, default=vars)) - response.raise_for_status() + response.raise_for_status() print('request completed') # create a new daemon thread @@ -119,13 +124,13 @@ def send_search_results_to_chatbot(search_uuid: str): # start the new thread chatbot_thread.start() # sleep(1) - + # on the first page load, only push top 20 records in each category - number_of_records_to_show_on_page_load = int(utils.config["number_of_records_to_show_on_page_load"]) - total_results = {} # the dict to keep the total number of search results + number_of_records_to_show_on_page_load = int(utils.config["number_of_records_to_show_on_page_load"]) + total_results = {} # the dict to keep the total number of search results displayed_results = {} # the dict to keep the total number of search results currently displayed to the user - + for k, v in results.items(): logger.info(f'Got {len(v)} {k}') total_results[k] = len(v) @@ -133,12 +138,12 @@ def send_search_results_to_chatbot(search_uuid: str): displayed_results[k] = len(results[k]) results["timedout_sources"] = list(set(results["timedout_sources"])) - logger.info('Following sources got timed out:' + ','.join(results["timedout_sources"])) + logger.info('Following sources got timed out:' + ','.join(results["timedout_sources"])) session['total_search_results'] = total_results - session['displayed_search_results'] = displayed_results - - template_response = render_template('results.html', results=results, total_results=total_results, search_term=search_term) + session['displayed_search_results'] = displayed_results + + template_response = render_template('results.html', results=results, total_results=total_results, search_term=search_term) logger.info('search server call completed - after render call') return template_response @@ -153,10 +158,10 @@ def load_more_publications(): total_search_results_publications = session['total_search_results']['publications'] displayed_search_results_publications = session['displayed_search_results']['publications'] - number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) results['publications'] = results['publications'][displayed_search_results_publications:displayed_search_results_publications+number_of_records_to_append_on_lazy_load] session['displayed_search_results']['publications'] = displayed_search_results_publications+number_of_records_to_append_on_lazy_load - return render_template('components/publications.html', results=results) + return render_template('components/publications.html', results=results) @app.route('/load-more-researchers', methods=['GET']) def load_more_researchers(): @@ -168,10 +173,10 @@ def load_more_researchers(): total_search_results_researchers = session['total_search_results']['researchers'] displayed_search_results_researchers = session['displayed_search_results']['researchers'] - number_of_records_to_append_on_lazy_load = 
int(utils.config["number_of_records_to_append_on_lazy_load"]) + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) results['researchers'] = results['researchers'][displayed_search_results_researchers:displayed_search_results_researchers+number_of_records_to_append_on_lazy_load] session['displayed_search_results']['researchers'] = displayed_search_results_researchers+number_of_records_to_append_on_lazy_load - return render_template('components/researchers.html', results=results) + return render_template('components/researchers.html', results=results) @app.route('/load-more-resources', methods=['GET']) def load_more_resources(): @@ -183,25 +188,25 @@ def load_more_resources(): total_search_results_resources = session['total_search_results']['resources'] displayed_search_results_resources = session['displayed_search_results']['resources'] - number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) results['resources'] = results['resources'][displayed_search_results_resources:displayed_search_results_resources+number_of_records_to_append_on_lazy_load] session['displayed_search_results']['resources'] = displayed_search_results_resources+number_of_records_to_append_on_lazy_load - return render_template('components/resources.html', results=results) + return render_template('components/resources.html', results=results) @app.route('/are-embeddings-generated', methods=['GET']) def are_embeddings_generated(): #Check the embeddings readiness only if the chatbot feature is enabled otherwise return False - if utils.config['chatbot_feature_enable'] == "True": + if (utils.config['chatbot_feature_enable']): print('are_embeddings_generated') uuid = session['search_uuid'] - chatbot_server = utils.config['chatbot_server'] - are_embeddings_generated = utils.config['endpoint_are_embeddings_generated'] - request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}" + chatbot_server = utils.config['chatbot_server'] + are_embeddings_generated = utils.config['endpoint_are_embeddings_generated'] + request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}" headers = { 'Content-Type': 'application/json' } - response = requests.request("GET", request_url, headers=headers) + response = requests.request("GET", request_url, headers=headers) json_response = response.json() print('json_response:', json_response) return str(json_response['file_exists']) @@ -220,7 +225,7 @@ def get_chatbot_answer(): search_uuid = session['search_uuid'] answer = chatbot.getAnswer(question=question, search_uuid=search_uuid) - + return answer @@ -250,6 +255,8 @@ def format_digital_obj_url(value): else: source_dict['sname'] = source.name source_dict['sid'] = value.identifier + source_dict['sname'] = source.name + source_dict['sid'] = source.identifier sources_list.append(source_dict) return json.dumps(sources_list) FILTERS["format_digital_obj_url"] = format_digital_obj_url @@ -257,10 +264,16 @@ def format_digital_obj_url(value): def format_authors_for_citations(value): authors = "" for author in value: - authors += (author.name + " and ") + authors += (author.name + " and ") return authors.rstrip(' and ') + "." 
FILTERS["format_authors_for_citations"] = format_authors_for_citations +import re +def regex_replace(s, find, replace): + """A non-optimal implementation of a regex filter""" + return re.sub(find, replace, s) +FILTERS["regex_replace"] = regex_replace + from urllib.parse import unquote import ast @@ -269,10 +282,10 @@ def format_authors_for_citations(value): def publication_details(sources): sources = unquote(sources) - sources = ast.literal_eval(sources) + sources = ast.literal_eval(sources) for source in sources: doi = source['doi'] - + publication = openalex_publications.get_publication(doi="https://doi.org/"+doi) response = make_response(render_template('publication-details.html', publication=publication)) @@ -282,8 +295,8 @@ def publication_details(sources): @app.route('/publication-details-references/', methods=['GET']) @utils.timeit def publication_details_references(doi): - print("doi:", doi) - + print("doi:", doi) + publication = crossref.get_publication(doi=doi) response = make_response(render_template('partials/publication-details/references.html', publication=publication)) @@ -293,7 +306,7 @@ def publication_details_references(doi): @app.route('/publication-details-recommendations/', methods=['GET']) @utils.timeit def publication_details_recommendations(doi): - print("DOI:", doi) + print("DOI:", doi) publications = semanticscholar.get_recommendations_for_publication(doi=doi) response = make_response(render_template('partials/publication-details/recommendations.html', publications=publications)) print("response:", response) @@ -302,7 +315,7 @@ def publication_details_recommendations(doi): @app.route('/publication-details-citations/', methods=['GET']) @utils.timeit def publication_details_citations(doi): - print("DOI:", doi) + print("DOI:", doi) publications = semanticscholar.get_citations_for_publication(doi=doi) response = make_response(render_template('partials/publication-details/citations.html', publications=publications)) print("response:", response) @@ -312,10 +325,10 @@ def publication_details_citations(doi): def resource_details(sources): sources = unquote(sources) - sources = ast.literal_eval(sources) + sources = ast.literal_eval(sources) for source in sources: doi = source['doi'] - + resource = zenodo.get_resource(doi="https://doi.org/"+doi) response = make_response(render_template('resource-details.html', resource=resource)) @@ -323,9 +336,16 @@ def resource_details(sources): return response -@app.route('/researcher-details') -def researcher_details(): - response = make_response(render_template('researcher-details.html')) +@app.route('/researcher-details/', methods=['GET']) +def researcher_details(index): + # index = json.loads(index) + # for result in results['researchers']: + # if result.source[0].identifier.replace("https://openalex.org/", "") == index[0]['sid']: + # researcher = result + # break + # logger.info(f'Found researcher {researcher}') + researcher = openalex_researchers.get_researcher_details(index) + response = make_response(render_template('researcher-details.html',researcher=researcher)) # Set search-session cookie to the session cookie value of the first visit if request.cookies.get('search-session') is None: @@ -336,6 +356,19 @@ def researcher_details(): return response +@app.route('/researcher-banner/', methods=['GET']) +def researcher_banner(index): + # logger.info(f'Fetching details for researcher with index {index}') + for result in results['researchers']: + if result.list_index == index: + researcher = result + break + # logger.info(f'Found 
researcher {researcher}') + researcher = openalex_researchers.get_researcher_banner(researcher) + if researcher.banner == "": + return jsonify() + return jsonify(imageUrl = f'data:image/jpeg;base64,{researcher.banner}') + @app.route('/organization-details//', methods=['GET']) def organization_details(organization_id, organization_name): diff --git a/objects.py b/objects.py index 058bb68..571d2f8 100644 --- a/objects.py +++ b/objects.py @@ -8,7 +8,7 @@ class thing: description: str = "" url: str = "" image: str = "" #url of the image - identifier: str = "" #doi or pid will be stored as identifier + identifier: str = "" #doi or pid will be stored as identifier originalSource: str = "" source: list() = field(default_factory=list) # this list will have "thing" objects rankScore: float = 0 #bm25 ranking score for sorting the search results @@ -18,7 +18,7 @@ def __str__(self): strValue = "" for field in fields(self): # print(field.type) - # concatenate all the property values + # concatenate all the property values strValue += f"{getattr(self, field.name)}###" return strValue @@ -54,15 +54,15 @@ class Person(thing): nationality: str = "" # we can later link it to country #this should be a list workLocation: str = "" #this should be a list worksFor: Organization = None #this should be a list - -Organization.founder = List[Person] + +Organization.founder = List[Person] # Organization.funder = Union[Organization(), Person()] Organization.parentOrganization = Organization() @dataclass class Author(Person): - # orcid: str = "" # we should not have this attribute; orcid should be kept in + # orcid: str = "" # we should not have this attribute; orcid should be kept in works_count: str = "" cited_by_count: str = "" @@ -92,7 +92,7 @@ class CreativeWork(thing): datePublished: str = "" encoding_contentUrl: Dict[str, str] = field(default_factory=dict) encodingFormat: str = "" - funder: Union[Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 + funder: Union[Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 funding: str = "" # we can change this to Grant genre: str = "" headline: str = "" @@ -119,9 +119,9 @@ class CreativeWork(thing): isDerivedFrom: List[Union[str, str]] = field(default_factory=list) documents: List[Union[str, str]] = field(default_factory=list) - + @dataclass -class Article(CreativeWork): +class Article(CreativeWork): articleBody: str = "" pageEnd: str = "" pageStart: str = "" @@ -132,13 +132,24 @@ class Article(CreativeWork): JournalVolume: str = "" @dataclass -class Dataset(CreativeWork): +class Dataset(CreativeWork): distribution: str = "" issn: str = "" +@dataclass +class Author(Person): + orcid: str = "" # we should not have this attribute; orcid should be kept in + works_count: str = "" + about: str = "" + banner: str = "" + cited_by_count: str = "" + url: str = "" + researchAreas: List[str] = field(default_factory=list) + works: List[Union[Article, Dataset]] = field(default_factory=list) + #The 'Project' is a new addition to schema.org, and as of now, there are no defined properties for it @dataclass -class Project(Organization): +class Project(Organization): dateStart: str = "" dateEnd: str = "" dateLastModified : str = "" @@ -157,7 +168,7 @@ class SoftwareApplication(CreativeWork): issn: str = "" softwareVersion: str = "" @dataclass -class LearningResource(CreativeWork): +class LearningResource(CreativeWork): assesses: str = "" #The item being described is intended to 
assess the competency or learning outcome defined by the referenced term. competencyRequired: str = "" educationalAlignment:str = "" @@ -167,7 +178,7 @@ class LearningResource(CreativeWork): teaches:str = "" #The item being described is intended to help a person learn the competency or learning outcome defined by the referenced term. @dataclass -class MediaObject(CreativeWork): +class MediaObject(CreativeWork): associatedArticle: str = "" bitrate: str = "" contentSize: str = "" @@ -187,9 +198,9 @@ class MediaObject(CreativeWork): startTime: str = "" uploadDate: str = "" width: str = "" - + @dataclass -class VideoObject(MediaObject): +class VideoObject(MediaObject): actor: str = "" caption: str = "" director: str = "" @@ -199,21 +210,21 @@ class VideoObject(MediaObject): videoFrameSize: str = "" videoQuality: str = "" @dataclass -class ImageObject(MediaObject): +class ImageObject(MediaObject): caption: str = "" embeddedTextCaption: str = "" exifData: str = "" #exif data for this object representativeOfPage: str = "" #Indicates whether this image is representative of the content of the page @dataclass -class Place(thing): +class Place(thing): additionalProperty: str = "" address: str = "" addressType: str = "" aggregateRating: str = "" amenityFeature: str = "" branchCode: str = "" - containedInPlace: str = "" + containedInPlace: str = "" containsPlace : str = "" event: str = "" faxNumber: str = "" @@ -228,17 +239,17 @@ class Place(thing): geoOverlaps: str = "" geoTouches: str = "" geoWithin: str = "" - globalLocationNumber: str = "" + globalLocationNumber: str = "" hasDriveThroughService: str = "" hasMap: str = "" - isAccessibleForFree: str = "" + isAccessibleForFree: str = "" isicV4: str = "" keywords: str = "" latitude: str = "" licence: str = "" logo: str = "" longitude: str = "" - maximumAttendeeCapacity: str = "" + maximumAttendeeCapacity: str = "" openingHoursSpecification: str = "" photo: str = "" placType: str = "" @@ -347,7 +358,7 @@ class Lesson: description: str date: str - + @dataclass class Publisher: id: str @@ -374,7 +385,7 @@ class Funder: class Gesis: resource_type: str url: str - date: str + date: str title: str description: str authors: str @@ -384,7 +395,7 @@ class Gesis: class Cordis: id: str url: str - date: str + date: str title: str description: str diff --git a/requirements.txt b/requirements.txt index 315ff0f..d3ab798 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -# gradio~=3.9.1 lxml==5.1.0 flask==2.3.2 extruct~=0.14.0 @@ -7,6 +6,7 @@ diophila~=0.4.0 requests==2.31.0 Wikipedia>=1.4.0 beautifulsoup4>=2.4.1 +numpy==1.26.3 pandas>=2.0.1 elg~=0.5.0 # not compatible with pydantic V2 gunicorn==20.1.0 @@ -18,3 +18,5 @@ xmltodict dateparser>=1.2.0 Flask-Session==0.5.0 rank_bm25==0.2.2 +python-dotenv==1.0.1 +openai==1.35.3 \ No newline at end of file diff --git a/sources/data_retriever.py b/sources/data_retriever.py index a621dfb..3c2f57c 100644 --- a/sources/data_retriever.py +++ b/sources/data_retriever.py @@ -22,7 +22,7 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): 'Content-Type': 'application/json', 'User-Agent': utils.config["request_header_user_agent"] } - print("url:", url) + # print("url:", url) response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) @@ -51,7 +51,7 @@ def retrieve_single_object(source: str, base_url: str, doi: str): try: doi = urllib.parse.quote_plus(string=doi, safe='()?&=,') url = base_url + doi - print('url:', url) + # print('url:', url) headers = {'Accept': 
'application/json', 'Content-Type': 'application/json', 'User-Agent': utils.config["request_header_user_agent"] diff --git a/sources/dblp_researchers.py b/sources/dblp_researchers.py index b578494..06ee368 100644 --- a/sources/dblp_researchers.py +++ b/sources/dblp_researchers.py @@ -29,7 +29,6 @@ def search(search_term: str, results): if int(total_hits) > 0: hits = hits['hit'] - for hit in hits: author = Author() @@ -65,7 +64,6 @@ def search(search_term: str, results): _source.url = info.get("url", "") author.source.append(_source) - results['researchers'].append(author) except requests.exceptions.Timeout as ex: diff --git a/sources/gepris.py b/sources/gepris.py index feeea5e..3f59270 100644 --- a/sources/gepris.py +++ b/sources/gepris.py @@ -2,7 +2,7 @@ from objects import Gepris import logging from bs4 import BeautifulSoup -from objects import Project, Person, Organization, Place +from objects import Project, Person, Organization, Place, Author, thing import utils logger = logging.getLogger('nfdi_search_engine') @@ -126,19 +126,18 @@ def find_author(search_term, results): if aurhtors_element: authors = aurhtors_element.find_all("div", class_=["eintrag_alternate","eintrag"]) - for author in authors: try: - authorObj = Person() - authorObj.source = 'GEPRIS' - authorObj.identifier = author.find("a")["href"] - authorObj.url = f'https://gepris.dfg.de{authorObj.identifier}' + authorObj = Author() + authorObj.source.append(thing(name='GEPRIS', identifier=author.find("a")["href"], url=f'https://gepris.dfg.de{authorObj.identifier}')) + # authorObj.identifier = author.find("a")["href"] + # authorObj.url = f'https://gepris.dfg.de{authorObj.identifier}' author_names = author.find("h2").text.strip() if "," in author_names: authorObj.name = author_names.replace(",", " ") - authorObj.affiliation = ' '.join(author.find("div", class_="beschreibung").find_all(string=True, recursive=False)).strip() - + for inst in author.find("div", class_="beschreibung").find_all(string=True, recursive=False): + authorObj.affiliation.append(Organization(name=inst)) results['researchers'].append(authorObj) except KeyError: diff --git a/sources/openalex_researchers.py b/sources/openalex_researchers.py index 6da4e5f..8c26583 100644 --- a/sources/openalex_researchers.py +++ b/sources/openalex_researchers.py @@ -1,10 +1,11 @@ import requests -from objects import thing, Article, Author +from objects import thing, Article, Author, Organization import logging import utils from sources import data_retriever import traceback - +from openai import OpenAI +import json # logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) logger = logging.getLogger('nfdi_search_engine') @@ -15,60 +16,322 @@ def generate_string_from_keys(dictionary): @utils.timeit def search(search_term: str, results): - + source = "OPENALEX Researchers" try: - search_result = data_retriever.retrieve_data(source=source, + search_result = data_retriever.retrieve_data(source=source, base_url=utils.config["search_url_openalex_researchers"], search_term=search_term, results=results) total_records_found = search_result['meta']['count'] hits = search_result.get("results", []) total_hits = len(hits) - logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + + if int(total_hits) > 0: + for hit in hits: + + author = Author() + # info = hit.get('info',{}) + author.orcid = hit.get("ids", {}).get("orcid", "") + author.name = 
hit.get('display_name', '') + alias = hit.get('display_name_alternatives', {}) + if isinstance(alias, str): + author.alternateName.append(alias) + if isinstance(alias, list): + for _alias in alias: + author.alternateName.append(_alias) + + affiliations = hit.get('affiliations', {}) + if isinstance(affiliations, list): + for affiliation in affiliations: + institution = affiliation.get('institution', {}) + if isinstance(institution, dict): + _organization = Organization() + _organization.name = institution.get('display_name', '') + years = affiliation.get('years', []) + if(len(years) > 1): _organization.keywords.append(f'{years[-1]}-{years[0]}') + else: _organization.keywords.append(f'{years[0]}') + author.affiliation.append(_organization) + + # topics = hit.get('topics', {}) + # if isinstance(topics, list): + # for topic in topics: + # name = topic.get('display_name', '') + # author.researchAreas.append(name) + # topics = hit.get('topic_share', {}) + # if isinstance(topics, list): + # for topic in topics: + # name = topic.get('display_name', '') + # author.researchAreas.append(name) + topics = hit.get('x_concepts', {}) + if isinstance(topics, list): + for topic in topics: + name = topic.get('display_name', '') + author.researchAreas.append(name) + + author.works_count = hit.get('works_count', '') + author.cited_by_count = hit.get('cited_by_count', '') + + _source = thing() + _source.name = 'OPENALEX' + _source.identifier = hit.get("ids", {}).get("openalex", "").replace('https://openalex.org/','') + author.source.append(_source) + + search_result_semantic = data_retriever.retrieve_data(source=source, + base_url="https://api.semanticscholar.org/graph/v1/author/search?fields=name,url,externalIds,paperCount,citationCount&query=", + search_term= author.name.replace(" ", "+"), + results={}) + semantic_hits = search_result_semantic.get("data", []) + for semantic_hit in semantic_hits: + if semantic_hit.get("externalIds", {}).get("ORCID", "") == author.orcid.replace('https://orcid.org/', ''): + author.works_count = semantic_hit.get('paperCount', '') + author.cited_by_count = semantic_hit.get('citationCount', '') + semanticId = semantic_hit.get("authorId", "") + _source = thing() + _source.name = 'SEMANTICSCHOLAR' + _source.identifier = semanticId + _source.url = semantic_hit.get("url", "") + author.source.append(_source) + break + + results['researchers'].append(author) + + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append(source) + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + logger.error(traceback.format_exc()) + +def convert_to_string(value): + if isinstance(value, list): + return ", ".join(convert_to_string(item) for item in value if item not in ("", [], {}, None)) + elif hasattr(value, '__dict__'): # Check if the value is an instance of a class + details = vars(value) + return ", ".join(f"{key}: {convert_to_string(val)}" for key, val in details.items() if val not in ("", [], {}, None)) + return str(value) + + +def get_researcher_details(url): + + source = "Researcher" + url = json.loads(url) + + try: + hit = data_retriever.retrieve_data(source=source, + base_url="https://api.openalex.org/authors/", + search_term=url[0]['sid'], + results={}) + + researcher = Author() + researcher.url = json.dumps(url) + researcher.orcid = hit.get("ids", {}).get("orcid", "") + researcher.name = hit.get('display_name', '') + alias = hit.get('display_name_alternatives', {}) + if isinstance(alias, str): + 
researcher.alternateName.append(alias) + if isinstance(alias, list): + for _alias in alias: + researcher.alternateName.append(_alias) + + affiliations = hit.get('affiliations', {}) + if isinstance(affiliations, list): + for affiliation in affiliations: + institution = affiliation.get('institution', {}) + if isinstance(institution, dict): + _organization = Organization() + _organization.name = institution.get('display_name', '') + years = affiliation.get('years', []) + if(len(years) > 1): _organization.keywords.append(f'{years[-1]}-{years[0]}') + else: _organization.keywords.append(f'{years[0]}') + researcher.affiliation.append(_organization) + + topics = hit.get('topics', {}) + if isinstance(topics, list): + for topic in topics: + name = topic.get('display_name', '') + researcher.researchAreas.append(name) + # topics = hit.get('topic_share', {}) + # if isinstance(topics, list): + # for topic in topics: + # name = topic.get('display_name', '') + # researcher.researchAreas.append(name) + # topics = hit.get('x_concepts', {}) + # if isinstance(topics, list): + # for topic in topics: + # name = topic.get('display_name', '') + # researcher.researchAreas.append(name) - if int(total_hits) > 0: + _source = thing() + _source.name = 'OPENALEX' + _source.identifier = hit.get("ids", {}).get("openalex", "").replace('https://openalex.org/','') + researcher.source.append(_source) + + + ##### uncomment to search openalex for publications... + # search_result = data_retriever.retrieve_data(source=source, + # base_url="https://api.openalex.org/works?filter=author.id:", + # search_term=researcher.source[0].identifier, + # results={}) + # total_records_found = search_result['meta']['count'] + # hits = search_result.get("results", []) + # total_hits = len(hits) + # logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + # if int(total_hits) > 0: + # for hit in hits: + + # publication = Article() + + # publication.name = utils.remove_html_tags(hit.get("title", "")) + # publication.url = hit.get("id", "") # not a valid url, openalex is currently working on their web interface. + # publication.identifier = hit.get("doi", "").replace("https://doi.org/", "") + # publication.datePublished = hit.get("publication_date", "") + # publication.inLanguage.append(hit.get("language", "")) + # publication.license = hit.get("primary_location", {}).get("license", "") + # # publication.publication = hit.get("primary_location", {}).get("source", {}).get("display_name", "") + + # abstract_inverted_index = hit.get("abstract_inverted_index", {}) + # publication.description = generate_string_from_keys(abstract_inverted_index) # Generate the string using keys from the dictionary + # publication.abstract = publication.description + + # authorships = hit.get("authorships", []) + # for authorship in authorships: + + # authors = authorship.get("author", {}) + + # _author = Author() + # _author.type = 'Person' + # _author.name = authors.get("display_name", "") + # _author.identifier = authors.get("orcid", "") + # publication.author.append(_author) + + # # getattr(publication, "source").clear() + # _source = thing() + # _source.name = 'OPENALEX' + # _source.identifier = hit.get("id", "").replace("https://openalex.org/", "") # remove the base url and only keep the ID + # _source.url = hit.get("id", "") # not a valid url, openalex is currently working on thier web interface. + # publication.source.append(_source) + + # researcher.works.append(publication) + + + # search semantic scholar... 
+ search_result = data_retriever.retrieve_data(source=source, + base_url="https://api.semanticscholar.org/graph/v1/author/search?fields=name,url,externalIds,paperCount,citationCount&query=", + search_term= researcher.name.replace(" ", "+"), + results={}) + hits = search_result.get("data", []) + for hit in hits: + if hit.get("externalIds", {}).get("ORCID", "") == researcher.orcid.replace('https://orcid.org/', ''): + researcher.works_count = hit.get('paperCount', '') + researcher.cited_by_count = hit.get('citationCount', '') + semanticId = hit.get("authorId", "") + _source = thing() + _source.name = 'SEMANTICSCHOLAR' + _source.identifier = semanticId + _source.url = hit.get("url", "") + researcher.source.append(_source) + break + search_result = data_retriever.retrieve_data(source=source, + base_url=f'https://api.semanticscholar.org/graph/v1/author/{semanticId}/papers?fields=url,title,venue,year,authors,abstract', + search_term= "", + results={}) + + hits = search_result.get("data", []) + a = 0 + total_hits = len(hits) + if int(total_hits) > 0: for hit in hits: - - publication = Article() - - publication.name = utils.remove_html_tags(hit.get("title", "")) - publication.url = hit.get("id", "") # not a valid url, openalex is currently working on their web interface. - publication.identifier = hit.get("doi", "").replace("https://doi.org/", "") - publication.datePublished = hit.get("publication_date", "") - publication.inLanguage.append(hit.get("language", "")) - publication.license = hit.get("primary_location", {}).get("license", "") + + publication = Article() + + publication.name = utils.remove_html_tags(hit.get("title", "")) + publication.url = hit.get("url", "") + publication.identifier = hit.get("title", "") + publication.description = hit.get("abstract", "") + # publication.identifier = hit.get("doi", "").replace("https://doi.org/", "") + publication.datePublished = hit.get("year", "") + # publication.inLanguage.append(hit.get("language", "")) + # publication.license = hit.get("primary_location", {}).get("license", "") # publication.publication = hit.get("primary_location", {}).get("source", {}).get("display_name", "") - abstract_inverted_index = hit.get("abstract_inverted_index", {}) - publication.description = generate_string_from_keys(abstract_inverted_index) # Generate the string using keys from the dictionary - publication.abstract = publication.description + # abstract_inverted_index = hit.get("abstract_inverted_index", {}) + # publication.description = generate_string_from_keys(abstract_inverted_index) # Generate the string using keys from the dictionary + # publication.abstract = publication.description - authorships = hit.get("authorships", []) + authorships = hit.get("authors", []) for authorship in authorships: - authors = authorship.get("author", {}) + # authors = authorship.get("author", {}) _author = Author() _author.type = 'Person' - _author.name = authors.get("display_name", "") - _author.identifier = authors.get("orcid", "") + _author.name = authorship.get("name", "") + # _author.identifier = authors.get("orcid", "") publication.author.append(_author) # getattr(publication, "source").clear() _source = thing() - _source.name = 'OPENALEX' - _source.identifier = hit.get("id", "").replace("https://openalex.org/", "") # remove the base url and only keep the ID - _source.url = hit.get("id", "") # not a valid url, openalex is currently working on their web interface. 
+ _source.name = 'SEMANTICSCHOLAR' + # _source.identifier = hit.get("id", "").replace("https://openalex.org/", "") # remove the base url and only keep the ID + # _source.url = hit.get("id", "") # not a valid url, openalex is currently working on thier web interface. publication.source.append(_source) - results['publications'].append(publication) - - except requests.exceptions.Timeout as ex: - logger.error(f'Timed out Exception: {str(ex)}') - results['timedout_sources'].append(source) - + researcher.works.append(publication) + a+=1 + + ### uncomment to generate about section + logger.info(f'Getting publications {a}') + details = vars(researcher) + # Convert the details into a string format + details_str = "\n".join(f"{key}: {convert_to_string(value)}" for key, value in details.items() if (value not in ("", [], {}, None) and key not in ("works", "source","orcid"))) + prompt = f"Generate a 2-3 line 'About' section for a researcher based on the following details:\n{details_str}" + client = OpenAI( + api_key=utils.env_config["OPENAI_API_KEY"], + ) + logger.info('sent message to openai') + chat_completion = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": f'{prompt}', + } + ], + model="gpt-3.5-turbo", + ) + # about_section = response.choices[0].text.strip() + researcher.about = chat_completion.choices[0].message.content.strip() + except Exception as ex: logger.error(f'Exception: {str(ex)}') - logger.error(traceback.format_exc()) \ No newline at end of file + logger.error(traceback.format_exc()) + + return researcher + +def get_researcher_banner(researcher: Author): + try: + details = vars(researcher) + details_str = "\n".join(f"{convert_to_string(value)}" for key, value in details.items() if (value not in ("", [], {}, None) and key in ("researchAreas"))) + prompt = f"A banner for researcher with following research areas:\n{researcher.about}" + client = OpenAI( + api_key=utils.env_config["OPENAI_API_KEY"], + ) + response = client.images.generate( + model="dall-e-2", + prompt=prompt, + size="512x512", + quality="standard", + response_format="b64_json", + n=1, + ) + researcher.banner = response.data[0].b64_json + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + logger.error(traceback.format_exc()) + + return researcher \ No newline at end of file diff --git a/sources/orcid.py b/sources/orcid.py index 21fbcae..0eaa93e 100644 --- a/sources/orcid.py +++ b/sources/orcid.py @@ -1,6 +1,6 @@ import requests import logging -from objects import Person, Author +from objects import Person, Author, thing, Organization import utils logger = logging.getLogger('nfdi_search_engine') @@ -9,10 +9,10 @@ def search(search_term: str, results): try: - + base_url = utils.config["search_url_orcid"] url = base_url + '"' + search_term.replace(' ', '+') + '"' - + headers = {'Accept': 'application/json', 'Content-Type': 'application/json', 'User-Agent': utils.config["request_header_user_agent"] @@ -30,19 +30,18 @@ def search(search_term: str, results): authors = search_result.get('expanded-result', None) if authors: for author in authors: - + authorObj = Author() - authorObj.source = 'ORCID' + # authorObj.source = 'ORCID' + authorObj.source.append(thing(name='ORCID')) given_names = author.get('given-names', '') family_names = author.get('family-names', '') authorObj.name = given_names + " " + family_names authorObj.orcid = author.get('orcid-id', '') - last_known_institution = author.get('institution-name', {}) - if last_known_institution: - authorObj.affiliation = 
last_known_institution[-1] - else: - authorObj.affiliation = '' + institution = author.get('institution-name', []) + for inst in institution: + authorObj.affiliation.append(Organization(name=inst)) authorObj.works_count = '' authorObj.cited_by_count = '' @@ -51,7 +50,7 @@ def search(search_term: str, results): except requests.exceptions.Timeout as ex: logger.error(f'Timed out Exception: {str(ex)}') results['timedout_sources'].append('ORCID') - + except Exception as ex: logger.error(f'Exception: {str(ex)}') @@ -80,11 +79,11 @@ def get_orcid_access_token(): else: print("Failed to obtain access token:", response.text) return None - + # Function to search for public information from ORCID def old_search(search_term, results): # It also can be used for retrieving further information, logging in, edite records etc - access_token = '45d5a287-de76-4a62-8ab9-1ffc046e7cde' + access_token = '45d5a287-de76-4a62-8ab9-1ffc046e7cde' headers = { 'Accept': 'application/json', 'Content-Type': 'application/json', @@ -106,7 +105,7 @@ def old_search(search_term, results): # Check if the response contains any search results if 'result' in json_data and isinstance(json_data['result'], list) and json_data['result']: # Iterate through the first 10 results for now (can be changed) - for result in json_data['result'][:10]: + for result in json_data['result'][:10]: orcid_id = result['orcid-identifier']['path'] # Generate the URL to the person's public profile in ORCID @@ -119,7 +118,7 @@ def old_search(search_term, results): if response.status_code == 200: # Extract the JSON response json_data = response.json() - + # Extract the name information name_data = json_data.get('name', {}) given_names = name_data.get('given-names', {}).get('value', '') @@ -153,7 +152,7 @@ def old_search(search_term, results): external_identifier_type = external_identifier.get('external-id-type', '') external_identifier_value = external_identifier.get('external-id-value', '') external_identifier_values.append((external_identifier_type, external_identifier_value)) - + affiliations = json_data.get('employments', {}).get('employment-summary', []) if affiliations: for affiliation in affiliations: @@ -193,7 +192,7 @@ def old_search(search_term, results): else: print("Failed to search for public data:", response.text) - logger.info(f'Got {len(results)} records from Orcid') - + logger.info(f'Got {len(results)} records from Orcid') + except requests.exceptions.RequestException as e: print("An error occurred during the request:", str(e)) \ No newline at end of file diff --git a/sources/wikidata_researchers.py b/sources/wikidata_researchers.py index 882adb7..89bec10 100644 --- a/sources/wikidata_researchers.py +++ b/sources/wikidata_researchers.py @@ -1,5 +1,5 @@ import requests -from objects import thing, Article, Author +from objects import thing, Article, Author, Organization import logging import utils from sources import data_retriever @@ -13,7 +13,7 @@ @utils.timeit def search(search_term: str, results): - + source = "WIKIDATA Researchers" try: @@ -25,89 +25,74 @@ def search(search_term: str, results): SERVICE wikibase:mwapi { bd:serviceParam wikibase:endpoint "www.wikidata.org"; - wikibase:api "EntitySearch"; + wikibase:api "EntitySearch"; mwapi:search "$search_string"; mwapi:language "en"; mwapi:limit "150". ?item wikibase:apiOutputItem mwapi:item. } - #?item (wdt:P279*/wdt:P31) wd:Q482980 . + ?item wdt:P106 ?occ . ?occ wdt:P279* wd:Q1650915 . 
OPTIONAL {?item wdt:P496 ?orcid .} OPTIONAL {?item wdt:P27 ?nationality.} OPTIONAL {?item wdt:P735 ?givenName.} - OPTIONAL {?item wdt:P734 ?familyName.} + OPTIONAL {?item wdt:P734 ?familyName.} OPTIONAL { ?item p:P108 ?st. ?st ps:P108 ?employer. ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) ?st pq:P580 ?date. - MINUS {?st pq:P582 ?enddate.} + MINUS {?st pq:P582 ?enddate.} } - OPTIONAL {?item wdt:P108 ?employer. + OPTIONAL {?item wdt:P108 ?employer. ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) - } + } SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . } } - GROUP by ?item ?itemLabel ?orcid ?nationalityLabel ?givenNameLabel ?familyNameLabel - - ''') + GROUP by ?item ?itemLabel ?orcid ?nationalityLabel ?givenNameLabel ?familyNameLabel + + ''') query = query_template.substitute(search_string=search_term) - query = ' '.join(query.split()) + query = ' '.join(query.split()) - search_result = data_retriever.retrieve_data(source=source, + search_result = data_retriever.retrieve_data(source=source, base_url=utils.config["search_url_wikidata"], search_term=query, results=results) - - hits = search_result.get("results", {}).get("bindings", []) + + hits = search_result.get("results", {}).get("bindings", []) total_hits = len(hits) - logger.info(f'{source} - {total_hits} hits found') + logger.info(f'{source} - {total_hits} hits found') - if int(total_hits) > 0: + if int(total_hits) > 0: for hit in hits: - - # this block should be updated to researchers - - publication = Article() - - publication.name = hit.get("label", {}).get("value","") - publication.url = hit.get("item", {}).get("value","") - publication.identifier = "" #DOI is available for few; we need to update the sparql query to fetch this information - publication.datePublished = datetime.strftime(parser.parse(hit.get('date', {}).get('value', "")), '%Y-%m-%d') - - authorsLabels = hit.get("authorsLabel", {}).get("value","") - for authorsLabel in authorsLabels.rstrip(",").split(","): - _author = Author() - _author.type = 'Person' - _author.name = authorsLabel - _author.identifier = "" #ORCID is available for few; we need to update the sparql query to pull this information - publication.author.append(_author) - - authorsStrings = hit.get("authorsString", {}).get("value","") - for authorsString in authorsStrings.rstrip(",").split(","): - _author = Author() - _author.type = 'Person' - _author.name = authorsString - _author.identifier = "" - publication.author.append(_author) - - _source = thing() - _source.name = 'WIKIDATA' - _source.identifier = hit['item'].get('value', "").replace("http://www.wikidata.org/", "") # remove the base url and only keep the ID - _source.url = hit['item'].get('value', "") - publication.source.append(_source) - - results['publications'].append(publication) - + + author = Author() + # info = hit.get('info',{}) + author.orcid = hit.get("orcid", {}).get("value", "") + author.name = hit.get('itemLabel', {}).get('value', '') + affiliations = hit.get('employerSampleLabel', {}) + if isinstance(affiliations, dict): + author.affiliation.append(Organization(name = affiliations.get('value', ''))) + author.works_count = '' + author.cited_by_count = '' + + _source = thing() + _source.name = 'WIKIDATA' + _source.identifier = hit.get("item", {}).get("value", "").replace("http://www.wikidata.org/", "") + _source.url = hit.get("item", {}).get("value", "") + author.source.append(_source) + + results['researchers'].append(author) + except requests.exceptions.Timeout as ex: logger.error(f'Timed out Exception: 
{str(ex)}') results['timedout_sources'].append(source) - + except Exception as ex: logger.error(f'Exception: {str(ex)}') logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/static/images/sources/semantic-scholar.png b/static/images/sources/semantic-scholar.png new file mode 100644 index 0000000000000000000000000000000000000000..e19faacefd3042f302f83cb7ab6a7fdf3411f1fb GIT binary patch literal 612 [base85-encoded PNG data omitted]
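The wikidata_researchers rewrite above funnels an EntitySearch call through the Wikidata Query Service and keeps only people whose occupation is a subclass of researcher (wd:Q1650915). A stripped-down sketch of that query pattern, runnable on its own — the search string and endpoint are hardcoded here for illustration, whereas the real code reads the endpoint from utils.config["search_url_wikidata"]:

    import requests

    query = """
    SELECT ?item ?itemLabel WHERE {
      SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:endpoint "www.wikidata.org";
                        wikibase:api "EntitySearch";
                        mwapi:search "Ada Lovelace";
                        mwapi:language "en".
        ?item wikibase:apiOutputItem mwapi:item.
      }
      ?item wdt:P106 ?occ . ?occ wdt:P279* wd:Q1650915 .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
    }
    LIMIT 5
    """
    # WDQS returns SPARQL JSON bindings; each binding mirrors the hit dicts parsed above
    response = requests.get("https://query.wikidata.org/sparql",
                            params={"query": query, "format": "json"},
                            headers={"User-Agent": "nfdi-search-engine (sketch)"})
    for binding in response.json().get("results", {}).get("bindings", []):
        print(binding["itemLabel"]["value"], binding["item"]["value"])

diff --git a/templates/components/publications.html b/templates/components/publications.html
--- a/templates/components/publications.html
+++ b/templates/components/publications.html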
@@ -68,168 +71,39 @@ class="bi bi-bookmark">
--> {% if publication.identifier != '' %}
+ data-bs-target="#share-publication-{{publication.identifier | regex_replace ('\W','') }}"> +
{% endif %} {% if publication.image != '' %}
{% endif %} {% if publication.encoding_contentUrl != '' %}
+ data-bs-toggle="modal" + data-bs-target="#download-publication-{{publication.identifier | regex_replace ('\W','') }}">
{% endif %}
- - - - - + {% with share_modal_id='share-publication-'+publication.identifier | regex_replace ('\W',''), + preview_modal_id='preview-publication-'+publication.identifier | regex_replace ('\W',''), + download_modal_id='download-publication-'+publication.identifier | regex_replace ('\W',''), + url = '/publication-details/'+ publication | format_digital_obj_url, + img_src = publication.image, + encoding_contentUrl = publication.encoding_contentUrl, + title = publication.name | trim + %} + {{ shareModal(share_modal_id, url, title) }} + {{ previewModal(preview_modal_id, title, img_src) }} + {{ downloadModal(download_modal_id, title, encoding_contentUrl) }} + {% endwith %}
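The modal ids built in this hunk pipe publication.identifier (a DOI) through the new regex_replace filter registered in main.py, since characters such as '.' and '/' are not valid inside an HTML id. A minimal sketch of how that filter behaves, assuming a bare Jinja2 Environment outside of Flask:

    import re
    from jinja2 import Environment

    def regex_replace(s, find, replace):
        """A non-optimal implementation of a regex filter"""
        return re.sub(find, replace, s)

    env = Environment()
    env.filters["regex_replace"] = regex_replace

    # '\W' strips every non-word character, so a DOI becomes a usable id suffix
    template = env.from_string("share-publication-{{ doi | regex_replace('\\W', '') }}")
    print(template.render(doi="10.5281/zenodo.4701615"))
    # -> share-publication-105281zenodo4701615

Because the same stripped identifier is used for data-bs-target and for the modal's own id, the share, preview, and download macros stay wired to the right publication card.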
@@ -247,4 +121,15 @@
-
\ No newline at end of file +
+ + \ No newline at end of file diff --git a/templates/components/researchers.html b/templates/components/researchers.html index 5860b65..81a87f0 100644 --- a/templates/components/researchers.html +++ b/templates/components/researchers.html @@ -9,7 +9,8 @@
@@ -83,4 +84,15 @@ more researchers {% endif %}
-
\ No newline at end of file +
+ + \ No newline at end of file diff --git a/templates/partials/common/download-modal.html b/templates/partials/common/download-modal.html new file mode 100644 index 0000000..39ca63e --- /dev/null +++ b/templates/partials/common/download-modal.html @@ -0,0 +1,70 @@ +{% macro downloadModal(modal_id, title, encoding_contentUrl) %} + + + + + + +{% endmacro %} \ No newline at end of file diff --git a/templates/partials/common/preview-modal.html b/templates/partials/common/preview-modal.html new file mode 100644 index 0000000..da5fc64 --- /dev/null +++ b/templates/partials/common/preview-modal.html @@ -0,0 +1,20 @@ +{% macro previewModal(modal_id, title, img_src) %} + + +{% endmacro %} \ No newline at end of file diff --git a/templates/partials/common/share-modal.html b/templates/partials/common/share-modal.html new file mode 100644 index 0000000..769a5dd --- /dev/null +++ b/templates/partials/common/share-modal.html @@ -0,0 +1,127 @@ +{% macro shareModal(modal_id, url, title) %} + + + + +{% endmacro %} \ No newline at end of file diff --git a/templates/researcher-details.html b/templates/researcher-details.html index dccb7a6..e85a522 100644 --- a/templates/researcher-details.html +++ b/templates/researcher-details.html @@ -1,3 +1,6 @@ +{% from 'partials/common/share-modal.html' import shareModal %} +{% from 'partials/common/preview-modal.html' import previewModal %} +{% from 'partials/common/download-modal.html' import downloadModal %} {% extends "layouts/base.html" %} {% block title %} Publication Details {% endblock title %} @@ -5,8 +8,7 @@ {% block stylesheets %} - - + {% endblock stylesheets %} {% block content %} @@ -14,81 +16,75 @@
-
+ +
@@ -297,6 +347,18 @@

Dr. rer. nat. (AKSW)

}); + // JavaScript to fetch and update the banner after page load + // document.addEventListener("DOMContentLoaded", function() { + // fetch('/researcher-banner/{{researcher.list_index}}') // Endpoint to generate the image + // .then(response => response.json()) + // .then(data => { + // if (data.imageUrl) { + // document.querySelector('.banner').style.backgroundImage = `url(${data.imageUrl})`; + // } + // }) + // .catch(error => console.error('Error fetching image:', error)); + // }); + diff --git a/templates/results.html b/templates/results.html index a5c7f1d..40cc598 100644 --- a/templates/results.html +++ b/templates/results.html @@ -422,44 +422,6 @@ }); - - - function base64Encode(str) { - var CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - var out = "", i = 0, len = str.length, c1, c2, c3; - while (i < len) { - c1 = str.charCodeAt(i++) & 0xff; - if (i == len) { - out += CHARS.charAt(c1 >> 2); - out += CHARS.charAt((c1 & 0x3) << 4); - out += "=="; - break; - } - c2 = str.charCodeAt(i++); - if (i == len) { - out += CHARS.charAt(c1 >> 2); - out += CHARS.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4)); - out += CHARS.charAt((c2 & 0xF) << 2); - out += "="; - break; - } - c3 = str.charCodeAt(i++); - out += CHARS.charAt(c1 >> 2); - out += CHARS.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4)); - out += CHARS.charAt(((c2 & 0xF) << 2) | ((c3 & 0xC0) >> 6)); - out += CHARS.charAt(c3 & 0x3F); - } - return out; - } - - function getBinary(file) { - var xhr = new XMLHttpRequest(); - xhr.open("GET", file, false); - xhr.overrideMimeType("text/plain; charset=x-user-defined"); - xhr.send(null); - return xhr.responseText; - } - function load_more_publications() { $('#div_load_more_publications').remove() diff --git a/utils.py b/utils.py index 11ebfa6..acea665 100644 --- a/utils.py +++ b/utils.py @@ -1,3 +1,4 @@ +import os import extruct from objects import Article, Person, Author import wikipedia @@ -9,6 +10,15 @@ config = yaml.load(f, Loader=yaml.FullLoader) +#load environment variables +from dotenv import find_dotenv, load_dotenv +_ = load_dotenv(find_dotenv()) +env_config = dict( + { + "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY", ""), + } +) + #region DECORATORS From 405511e71080c1c81a26ccb9864bb78407fda956 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Tue, 9 Jul 2024 11:42:18 +0530 Subject: [PATCH 6/9] solved errors --- main.py | 68 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/main.py b/main.py index 4811edf..f5bc4d4 100644 --- a/main.py +++ b/main.py @@ -122,25 +122,25 @@ def search_results(): # Chatbot - push search results to chatbot server for embeddings generation if (utils.config['chatbot_feature_enable']): - if (utils.config['chatbot_feature_enable']): - - # Convert a UUID to a 32-character hexadecimal string - search_uuid = uuid.uuid4().hex - session['search_uuid'] = search_uuid - - def send_search_results_to_chatbot(search_uuid: str): - print('request is about to start') - chatbot_server = utils.config['chatbot_server'] - save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings'] - request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}' - response = requests.post(request_url, json=json.dumps(results, default=vars)) - response.raise_for_status() - print('request completed') - - # create a new daemon thread - chatbot_thread = threading.Thread(target=send_search_results_to_chatbot, args=(search_uuid,), daemon=True) - # start the new thread - 
chatbot_thread.start() + if (utils.config['chatbot_feature_enable']): + + # Convert a UUID to a 32-character hexadecimal string + search_uuid = uuid.uuid4().hex + session['search_uuid'] = search_uuid + + def send_search_results_to_chatbot(search_uuid: str): + print('request is about to start') + chatbot_server = utils.config['chatbot_server'] + save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings'] + request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}' + response = requests.post(request_url, json=json.dumps(results, default=vars)) + response.raise_for_status() + print('request completed') + + # create a new daemon thread + chatbot_thread = threading.Thread(target=send_search_results_to_chatbot, args=(search_uuid,), daemon=True) + # start the new thread + chatbot_thread.start() # sleep(1) @@ -216,21 +216,21 @@ def are_embeddings_generated(): #Check the embeddings readiness only if the chatbot feature is enabled otherwise return False if (utils.config['chatbot_feature_enable']): - if (utils.config['chatbot_feature_enable']): - print('are_embeddings_generated') - uuid = session['search_uuid'] - chatbot_server = utils.config['chatbot_server'] - are_embeddings_generated = utils.config['endpoint_are_embeddings_generated'] - request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}" - headers = { - 'Content-Type': 'application/json' - } - response = requests.request("GET", request_url, headers=headers) - json_response = response.json() - print('json_response:', json_response) - return str(json_response['file_exists']) - else: - return str(True) + if (utils.config['chatbot_feature_enable']): + print('are_embeddings_generated') + uuid = session['search_uuid'] + chatbot_server = utils.config['chatbot_server'] + are_embeddings_generated = utils.config['endpoint_are_embeddings_generated'] + request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}" + headers = { + 'Content-Type': 'application/json' + } + response = requests.request("GET", request_url, headers=headers) + json_response = response.json() + print('json_response:', json_response) + return str(json_response['file_exists']) + else: + return str(True) @app.route('/get-chatbot-answer', methods=['GET']) def get_chatbot_answer(): From 2104a38637af4283e130620c76876c652a6b6b43 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Tue, 9 Jul 2024 18:05:50 +0530 Subject: [PATCH 7/9] Resources details page almost completed --- .github/workflows/main.yml | 8 +-- a.py | 62 +++++++++++++++++++ main.py | 44 +++---------- requirements.txt | 2 +- sources/data_retriever.py | 16 ++--- sources/openalex_researchers.py | 4 +- sources/zenodo.py | 106 +++++++++++++++++--------------- templates/resource-details.html | 50 ++++++++++----- utils.py | 16 +++-- 9 files changed, 179 insertions(+), 129 deletions(-) create mode 100644 a.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a730a93..f159ac8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,22 +7,18 @@ on: jobs: deploy: - runs-on: self-hosted + runs-on: self-hosted steps: - name: 'Check out repo' uses: actions/checkout@v3 with: - ref: main + ref: main - name: 'Stop the running NFDI Search Engine' run: docker compose down - name: 'Delete old Docker image' run: docker image rm nfdi-search-engine-search-engine - name: 'Copy logging.conf' -<<<<<<< HEAD run: cp logging.conf.example logging.conf -======= - run: cp logging.conf.example logging.conf ->>>>>>> origin/develop - name: 'Create .env' run: | echo 
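For context, the chatbot hand-off that these re-indented blocks implement boils down to the round trip below: push the serialized results under a fresh uuid, then poll until the server reports that the embeddings file exists. This is a sketch only — the base URL and endpoint paths are placeholders for the values read from utils.config:

    import json
    import time
    import uuid
    import requests

    # placeholders; the real values come from utils.config
    chatbot_server = "http://localhost:6000"
    save_docs_endpoint = "/save-docs-with-embeddings"
    readiness_endpoint = "/are-embeddings-generated"

    search_uuid = uuid.uuid4().hex
    results = {"publications": [], "researchers": [], "resources": []}

    # push the serialized search results so the server can embed them
    response = requests.post(f"{chatbot_server}{save_docs_endpoint}/{search_uuid}",
                             json=json.dumps(results, default=vars))
    response.raise_for_status()

    # poll the readiness endpoint until the embeddings are on disk
    while True:
        response = requests.get(f"{chatbot_server}{readiness_endpoint}/{search_uuid}",
                                headers={"Content-Type": "application/json"})
        if response.json().get("file_exists"):
            break
        time.sleep(1)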
"OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> ./.env diff --git a/a.py b/a.py new file mode 100644 index 0000000..e6d92b0 --- /dev/null +++ b/a.py @@ -0,0 +1,62 @@ +import urllib.parse +import requests +import logging +import utils +import json +from sources import data_retriever +from objects import CreativeWork, Author + +base_url = "https://zenodo.org/api/records?size=25&q=" +doi = "4701615" + +encoded_doi = urllib.parse.quote_plus(string=doi, safe='()?&=,') +url = base_url + encoded_doi +print(url) +headers = { + 'Accept': 'application/json', + 'Content-Type': 'application/json', + 'User-Agent': utils.config["request_header_user_agent"] +} + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +try: + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + logger.debug(f'Response status code: {response.status_code}') + + if response.status_code == 200: + search_results = response.json() + search_results = utils.clean_json(search_results) + # print(search_result) + search_result = search_results.get("hits", {}).get("hits", []) + search_result = search_result[0] + metadata = search_result.get("metadata", {}) + resource = CreativeWork() + resource.name = metadata.get("title", "") + resource.url = metadata.get('links', {}).get('self', '') + resource.identifier = metadata.get("doi", "") + resource.datePublished = metadata.get("publication_date", "") + resource.inLanguage.append(metadata.get("language", "")) + resource.license = metadata.get("license", "") + + resource.description = utils.remove_html_tags(metadata.get("description", "")) + resource.abstract = resource.description + + authors = metadata.get("creators", []) + for author in authors: + _author = Author() + _author.type = 'Person' + _author.name = author.get("name", "") + _author.identifier = author.get("orcid", "") + _author.affiliation = author.get("affiliation", "") + resource.author.append(_author) + # print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + # print(json.dumps(search_result, indent=4)) + # print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + print(resource.author[0].name) + else: + logger.error(f'Failed to retrieve data: {response.status_code}') + +except requests.exceptions.RequestException as e: + logger.error(f'An error occurred: {e}') diff --git a/main.py b/main.py index f5bc4d4..e5a5ba5 100644 --- a/main.py +++ b/main.py @@ -273,9 +273,7 @@ def format_digital_obj_url(value): source_dict['sname'] = source else: source_dict['sname'] = source.name - source_dict['sid'] = value.identifier - source_dict['sname'] = source.name - source_dict['sid'] = source.identifier + source_dict['sid'] = source.identifier sources_list.append(source_dict) return json.dumps(sources_list) FILTERS["format_digital_obj_url"] = format_digital_obj_url @@ -353,32 +351,20 @@ def resource_details(sources): sources = ast.literal_eval(sources) for source in sources: doi = source['doi'] - - resource = zenodo.get_resource(doi="https://doi.org/"+doi) + resource = zenodo.get_resource(doi) response = make_response(render_template('resource-details.html', resource=resource)) print("response:", response) return response - -@app.route('/researcher-details/', methods=['GET']) -def researcher_details(index): - # index = json.loads(index) - # for result in results['researchers']: - # if result.source[0].identifier.replace("https://openalex.org/", "") == index[0]['sid']: - # researcher = result - # break - # logger.info(f'Found researcher {researcher}') - researcher = 
openalex_researchers.get_researcher_details(index) - response = make_response(render_template('researcher-details.html',researcher=researcher)) @app.route('/researcher-details/', methods=['GET']) def researcher_details(index): - # index = json.loads(index) - # for result in results['researchers']: - # if result.source[0].identifier.replace("https://openalex.org/", "") == index[0]['sid']: - # researcher = result - # break - # logger.info(f'Found researcher {researcher}') + index = json.loads(index) + for result in results['researchers']: + if result.source[0].identifier.replace("https://openalex.org/", "") == index[0]['sid']: + researcher = result + break + logger.info(f'Found researcher {researcher}') researcher = openalex_researchers.get_researcher_details(index) response = make_response(render_template('researcher-details.html',researcher=researcher)) @@ -404,20 +390,6 @@ def researcher_banner(index): return jsonify() return jsonify(imageUrl = f'data:image/jpeg;base64,{researcher.banner}') -@app.route('/researcher-banner/', methods=['GET']) -def researcher_banner(index): - # logger.info(f'Fetching details for researcher with index {index}') - for result in results['researchers']: - if result.list_index == index: - researcher = result - break - # logger.info(f'Found researcher {researcher}') - researcher = openalex_researchers.get_researcher_banner(researcher) - if researcher.banner == "": - return jsonify() - return jsonify(imageUrl = f'data:image/jpeg;base64,{researcher.banner}') - - @app.route('/organization-details//', methods=['GET']) def organization_details(organization_id, organization_name): try: diff --git a/requirements.txt b/requirements.txt index d3ab798..b97b892 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ dateparser>=1.2.0 Flask-Session==0.5.0 rank_bm25==0.2.2 python-dotenv==1.0.1 -openai==1.35.3 \ No newline at end of file +==1.35.3 \ No newline at end of file diff --git a/sources/data_retriever.py b/sources/data_retriever.py index 3c2f57c..28596cc 100644 --- a/sources/data_retriever.py +++ b/sources/data_retriever.py @@ -8,7 +8,7 @@ logger = logging.getLogger('nfdi_search_engine') def retrieve_data(source: str, base_url: str, search_term: str, results): - + try: search_term = urllib.parse.quote_plus(string=search_term, safe='()?&=,') @@ -23,8 +23,8 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): 'User-Agent': utils.config["request_header_user_agent"] } # print("url:", url) - - response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) logger.debug(f'{source} response status code: {response.status_code}') logger.debug(f'{source} response headers: {response.headers}') @@ -36,7 +36,7 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): #clean the json response; remove all the keys which don't have any value search_result = utils.clean_json(search_result) - return search_result + return search_result else: logger.error(f'Response status code: {str(response.status_code)}') @@ -47,8 +47,8 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): raise ex def retrieve_single_object(source: str, base_url: str, doi: str): - - try: + + try: doi = urllib.parse.quote_plus(string=doi, safe='()?&=,') url = base_url + doi # print('url:', url) @@ -56,7 +56,7 @@ def retrieve_single_object(source: str, base_url: str, doi: str): 'Content-Type': 'application/json', 
'User-Agent': utils.config["request_header_user_agent"] } - response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) logger.debug(f'{source} response status code: {response.status_code}') # logger.debug(f'{source} response headers: {response.headers}') @@ -68,7 +68,7 @@ def retrieve_single_object(source: str, base_url: str, doi: str): #clean the json response; remove all the keys which don't have any value search_result = utils.clean_json(search_result) - return search_result + return search_result else: logger.error(f'Response status code: {str(response.status_code)}') diff --git a/sources/openalex_researchers.py b/sources/openalex_researchers.py index b83f0a1..6e4e53d 100644 --- a/sources/openalex_researchers.py +++ b/sources/openalex_researchers.py @@ -310,7 +310,7 @@ def get_researcher_details(url): ], model="gpt-3.5-turbo", ) - # about_section = response.choices[0].text.strip() + about_section = response.choices[0].text.strip() researcher.about = chat_completion.choices[0].message.content.strip() except Exception as ex: @@ -391,5 +391,3 @@ def get_researcher_banner(researcher: Author): logger.error(traceback.format_exc()) return researcher - - return researcher \ No newline at end of file diff --git a/sources/zenodo.py b/sources/zenodo.py index a83974c..efe39e4 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -14,61 +14,61 @@ def search(search_term, results): source = "Zenodo" try: - search_result = data_retriever.retrieve_data(source=source, + search_result = data_retriever.retrieve_data(source=source, base_url=utils.config["search_url_zenodo"], search_term=search_term, - results=results) + results=results) total_records_found = search_result.get("hits", {}).get("total", 0) hits = search_result.get("hits", {}).get("hits", []) total_hits = len(hits) - logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') if int(total_hits) > 0: for hit in hits: - + metadata = hit.get('metadata', {}) resource_type = metadata.get('resource_type', {}).get('type','OTHER').upper() if resource_type == 'PUBLICATION': - digitalObj = Article() + digitalObj = Article() elif resource_type in ['PRESENTATION', 'POSTER']: - digitalObj = CreativeWork() + digitalObj = CreativeWork() elif resource_type == 'DATASET': - digitalObj = Dataset() + digitalObj = Dataset() elif resource_type == 'VIDEO': - digitalObj = VideoObject() + digitalObj = VideoObject() elif resource_type == 'IMAGE': - digitalObj = ImageObject() + digitalObj = ImageObject() elif resource_type == 'LESSON': - digitalObj = LearningResource() + digitalObj = LearningResource() elif resource_type == 'SOFTWARE': - digitalObj = SoftwareApplication() + digitalObj = SoftwareApplication() elif resource_type == 'OTHER': - digitalObj = CreativeWork() + digitalObj = CreativeWork() else: print('This resource type is still not defined:', resource_type) digitalObj = CreativeWork() - + digitalObj.identifier = hit.get('doi', '') digitalObj.name = hit.get('title', '') - digitalObj.url = hit.get('links', {}).get('self', '') + digitalObj.url = hit.get('links', {}).get('self', '') digitalObj.genre = resource_type digitalObj.description = utils.remove_html_tags(metadata.get('description', '')) - + keywords = metadata.get('keywords', []) if isinstance(keywords, list): for keyword in keywords: terms = [term.strip() 
for term in keyword.split(",")] - digitalObj.keywords.extend(terms) - + digitalObj.keywords.extend(terms) + language = metadata.get('language', '') digitalObj.inLanguage.append(language) digitalObj.dateCreated = hit.get('created','') - digitalObj.dateModified = hit.get('modified','') + digitalObj.dateModified = hit.get('modified','') digitalObj.datePublished = metadata.get('resource_date', '') - digitalObj.license = metadata.get('license', {}).get('id', '') - digitalObj.creativeWorkStatus = hit.get('status','') + digitalObj.license = metadata.get('license', {}).get('id', '') + digitalObj.creativeWorkStatus = hit.get('status','') digitalObj.funder = metadata.get('grants', [{}])[0].get('funder', {}).get('name', '') digitalObj.conditionsOfAccess = metadata.get('access-rights','') if(digitalObj.conditionsOfAccess == ''): @@ -92,7 +92,7 @@ def search(search_term, results): # for related_identifier in related_identifiers: # relation = related_identifier.get('relation', '').lower() # identifier = related_identifier.get('identifier', '') - + # if relation == 'iscitedby': # digitalObj.isCitedBy.append(identifier) # elif relation == 'issupplementto': @@ -114,18 +114,18 @@ def search(search_term, results): # elif relation == 'haspart': # digitalObj.hasPart.append(identifier) - authors = metadata.get("creators", []) + authors = metadata.get("creators", []) for author in authors: _author = Author() _author.type = 'Person' _author.name = author.get("name", "") _author.identifier = author.get("orcid", "") _author.affiliation = author.get("affiliation", "") - digitalObj.author.append(_author) + digitalObj.author.append(_author) Stats = hit.get('stats', '') _stats = Statistics() - + _stats.downloads = Stats.get("downloads", '') _stats.unique_downloads = Stats.get("unique_downloads", '') _stats.views = Stats.get("views", '') @@ -134,23 +134,23 @@ def search(search_term, results): _stats.version_unique_downloads = Stats.get("version_unique_downloads", '') _stats.version_unique_views = Stats.get("version_unique_views", '') _stats.version_views = Stats.get("version_views", '') - - digitalObj.stats = _stats - - contributors = metadata.get("contributors", []) + + digitalObj.stats = _stats + + contributors = metadata.get("contributors", []) for contributor in contributors: _contributor = Author() _contributor.type = 'Person' _contributor.name = contributor.get("name", "") _contributor.identifier = contributor.get("orcid", "") _contributor.affiliation = contributor.get("affiliation", "") - digitalObj.contributor.append(_contributor) + digitalObj.contributor.append(_contributor) _source = thing() _source.name = source _source.identifier = hit.get("id", "") - _source.url = hit.get('links', {}).get('self_html', '') - digitalObj.source.append(_source) + _source.url = hit.get('links', {}).get('self_html', '') + digitalObj.source.append(_source) files = hit.get('files', []) @@ -177,44 +177,50 @@ def search(search_term, results): digitalObj.Journal = journal_info.get('title', '') digitalObj.JournalVolume = journal_info.get('volume', '') digitalObj.issue = journal_info.get('issue', '') - + results['publication'].append(digitalObj) - elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']: - results['resources'].append(digitalObj) + elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']: + results['resources'].append(digitalObj) else: - results['others'].append(digitalObj) + results['others'].append(digitalObj) except 
requests.exceptions.Timeout as ex: logger.error(f'Timed out Exception: {str(ex)}') results['timedout_sources'].append(source) - + except Exception as ex: logger.error(f'Exception: {str(ex)}') logger.error(traceback.format_exc()) - + @utils.timeit def get_resource(doi: str): - + source = "Zenodo" + start_index = doi.find("zenodo.") + len("zenodo.") + if(start_index): + numeric_string = doi[start_index:] + else: + numeric_string = doi try: - search_result = data_retriever.retrieve_single_object(source=source, + search_results = data_retriever.retrieve_single_object(source=source, base_url=utils.config["search_url_zenodo"], - doi=doi) - + doi=numeric_string) + search_result = search_results.get("hits", {}).get("hits", []) + search_result = search_result[0] metadata = search_result.get('metadata', {}) - resource = CreativeWork() - resource.name = search_result.get("title", "") - resource.url = search_result.get('links', {}).get('self', '') + resource = CreativeWork() + resource.name = search_result.get("title", "") + resource.url = search_result.get('links', {}).get('self', '') resource.identifier = search_result.get("doi", "") - resource.datePublished = metadata.get("publication_date", "") + resource.datePublished = metadata.get("publication_date", "") resource.inLanguage.append(metadata.get("language", "")) resource.license = metadata.get("license", "") resource.description = utils.remove_html_tags(metadata.get("description", "")) resource.abstract = resource.description - authors = search_result.get("creators", []) + authors = metadata.get("creators", []) for author in authors: _author = Author() _author.type = 'Person' @@ -227,13 +233,13 @@ def get_resource(doi: str): if isinstance(keywords, list): for keyword in keywords: terms = [term.strip() for term in keyword.split(",")] - resource.keywords.extend(terms) - + resource.keywords.extend(terms) + return resource except requests.exceptions.Timeout as ex: - logger.error(f'Timed out Exception: {str(ex)}') - + logger.error(f'Timed out Exception: {str(ex)}') + except Exception as ex: logger.error(f'Exception: {str(ex)}') logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/templates/resource-details.html b/templates/resource-details.html index 7a605c1..3b3534b 100644 --- a/templates/resource-details.html +++ b/templates/resource-details.html @@ -80,23 +80,29 @@

{{resource.name}}

-            AUTHORS ({{resource.author|length}})
-            {% for author in resource.author %}
-            {% endfor %}
+            AUTHORS ({{ resource.author|length }})
+            {% if resource.author|length > 0 %}
+            {% for author in resource.author %}
+            {% endfor %}
+            {% else %}
+            No authors available.
+            {% endif %}
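The empty-author guard added above is easy to verify outside Flask. A minimal harness using Jinja2 directly (hypothetical, not part of the patch; the real template carries additional markup around each author) behaves as follows:

    from jinja2 import Template

    # Stripped-down version of the authors block from resource-details.html.
    snippet = Template(
        "AUTHORS ({{ resource.author|length }})\n"
        "{% if resource.author|length > 0 %}"
        "{% for author in resource.author %}{{ author.name }}; {% endfor %}"
        "{% else %}No authors available.{% endif %}"
    )

    class Stub:
        # Minimal stand-in for a CreativeWork whose Zenodo record has no creators.
        author = []

    print(snippet.render(resource=Stub()))
    # AUTHORS (0)
    # No authors available.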
ABSTRACT
+ data-bs-placement="top" title="Retrieved from Zenodo">
{{ resource.description}} @@ -115,7 +121,7 @@
SUPPLEMENTAL MATERIAL
REFERENCES
+ data-bs-placement="top" title="Retrieved from CrossRef">
@@ -131,8 +137,8 @@
CITATIONS
  1.
- Presse- und Informationsamt der Bundesregierung (2019).
- Women and Politics. GESIS Data Archive, Cologne. ZA6719
+ Presse- und Informationsamt der Bundesregierung (2019).
+ Women and Politics. GESIS Data Archive, Cologne. ZA6719
  Data file Version 1.0.0, https://doi.org/10.4232/1.13220.
  2. @@ -159,8 +165,22 @@
    CITED BY
    --> + +
    +
    -
    +
    RECOMMENDATIONS
    @@ -199,8 +219,6 @@
    RECOMMENDATIONS
    --> -
    -
    FAIR Assessment
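The ABSTRACT block above renders descriptions that sources/zenodo.py has already passed through utils.remove_html_tags, since Zenodo delivers descriptions as HTML. The helper itself is not shown in this diff; a minimal regex-based sketch of its assumed behaviour (the actual utils.py may well use a proper HTML parser instead) looks like this:

    import re

    def remove_html_tags(text: str) -> str:
        """Strip HTML tags and collapse leftover whitespace (assumed behaviour)."""
        without_tags = re.sub(r"<[^>]+>", " ", text or "")
        return re.sub(r"\s+", " ", without_tags).strip()

    print(remove_html_tags("<p>Women and <b>Politics</b></p>"))  # Women and Politics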
    diff --git a/utils.py b/utils.py index acea665..8cfc495 100644 --- a/utils.py +++ b/utils.py @@ -137,11 +137,11 @@ def parse_date(date_str): except (TypeError, ValueError): print(f"original date str: {date_str}") return "" - + # def sort_results_publications(results): -# def custom_sort_key(obj): -# desc = getattr(obj, 'description', '') -# pub_date = getattr(obj, 'datePublished', '0000-00-00') +# def custom_sort_key(obj): +# desc = getattr(obj, 'description', '') +# pub_date = getattr(obj, 'datePublished', '0000-00-00') # if desc == '': # return (0, pub_date) # return (1, pub_date) @@ -153,10 +153,10 @@ def sort_search_results(search_term, search_results): tokenized_results = [str(result).lower().split(" ") for result in search_results] if len(tokenized_results) > 0: bm25 = BM25Plus(tokenized_results) - + tokenized_query = search_term.lower().split(" ") doc_scores = bm25.get_scores(tokenized_query) - + for idx, doc_score in enumerate(doc_scores): search_results[idx].rankScore = doc_score @@ -168,6 +168,4 @@ def split_authors(authors_names, seperator, authors_list): _author = Author() _author.type = 'Person' _author.name = author - authors_list.append(_author) - - \ No newline at end of file + authors_list.append(_author) From fbed53e10bef07a2bd362c4c6f642d5b1fe6ed56 Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Mon, 15 Jul 2024 13:35:47 +0530 Subject: [PATCH 8/9] resources details complete --- .github/workflows/main.yml | 8 +- a.py | 10 +- main.py | 42 +++++-- requirements.txt | 2 +- sources/dblp.py | 126 +++++++++++++++++++ sources/openaire.py | 181 +++++++++++++++++++++++++++ sources/openalex.py | 215 ++++++++++++++++++++++++++++++++ sources/openalex_researchers.py | 5 +- sources/wikidata.py | 171 +++++++++++++++++++++++++ sources/zenodo.py | 11 +- templates/resource-details.html | 13 +- 11 files changed, 762 insertions(+), 22 deletions(-) create mode 100644 sources/dblp.py create mode 100644 sources/openaire.py create mode 100644 sources/openalex.py create mode 100644 sources/wikidata.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f159ac8..a730a93 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,18 +7,22 @@ on: jobs: deploy: - runs-on: self-hosted + runs-on: self-hosted steps: - name: 'Check out repo' uses: actions/checkout@v3 with: - ref: main + ref: main - name: 'Stop the running NFDI Search Engine' run: docker compose down - name: 'Delete old Docker image' run: docker image rm nfdi-search-engine-search-engine - name: 'Copy logging.conf' +<<<<<<< HEAD run: cp logging.conf.example logging.conf +======= + run: cp logging.conf.example logging.conf +>>>>>>> origin/develop - name: 'Create .env' run: | echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> ./.env diff --git a/a.py b/a.py index e6d92b0..96ee045 100644 --- a/a.py +++ b/a.py @@ -7,7 +7,11 @@ from objects import CreativeWork, Author base_url = "https://zenodo.org/api/records?size=25&q=" -doi = "4701615" +doi = "r3730f562f9e::324df2bd7d05a0942f31f0fe34e2eefa" + +# search_result = data_retriever.retrieve_single_object(source=source, +# base_url= +# doi=doi) encoded_doi = urllib.parse.quote_plus(string=doi, safe='()?&=,') url = base_url + encoded_doi @@ -52,9 +56,9 @@ _author.affiliation = author.get("affiliation", "") resource.author.append(_author) # print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") - # print(json.dumps(search_result, indent=4)) + print(json.dumps(search_result, indent=4)) # print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") - print(resource.author[0].name) 
+ # print( resource.name) else: logger.error(f'Failed to retrieve data: {response.status_code}') diff --git a/main.py b/main.py index e5a5ba5..a490346 100644 --- a/main.py +++ b/main.py @@ -57,12 +57,12 @@ @app.route('/') def index(): - if (utils.env_config["OPENAI_API_KEY"] == ""): - return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.')) + # if (utils.env_config["OPENAI_API_KEY"] == ""): + # return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.')) - if (utils.env_config["OPENAI_API_KEY"] == ""): - return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.')) + # if (utils.env_config["OPENAI_API_KEY"] == ""): + # return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.')) response = make_response(render_template('index.html')) @@ -99,8 +99,7 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, - eudat, openaire_products, re3data, orkg, openalex_researchers] - # sources = [openalex_publications] + eudat, eulg, openaire_products, re3data, orkg, openalex_researchers] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() @@ -351,12 +350,41 @@ def resource_details(sources): sources = ast.literal_eval(sources) for source in sources: doi = source['doi'] - resource = zenodo.get_resource(doi) + resource = zenodo.get_resource(doi="https://doi.org/"+doi) response = make_response(render_template('resource-details.html', resource=resource)) print("response:", response) return response +@app.route('/resource-details-citations/', methods=['GET']) +@utils.timeit +def resource_details_citations(doi): + print("DOI:", doi) + resource = semanticscholar.get_citations_for_publication(doi=doi) + response = make_response(render_template('partials/publication-details/citations.html', resource=resource)) + print("response:", response) + return response + +@app.route('/resource-details-references/', methods=['GET']) +@utils.timeit +def resource_details_references(doi): + print("doi:", doi) + + resource = crossref.get_publication(doi=doi) + response = make_response(render_template('partials/publication-details/references.html', resource=resource)) + + print("response:", response) + return response + +@app.route('/resource-details-recommendations/', methods=['GET']) +@utils.timeit +def resource_details_recommendations(doi): + print("DOI:", doi) + publications = semanticscholar.get_recommendations_for_publication(doi=doi) + response = make_response(render_template('partials/publication-details/recommendations.html', publications=publications)) + print("response:", response) + return response + @app.route('/researcher-details/', methods=['GET']) def researcher_details(index): index = json.loads(index) diff --git a/requirements.txt b/requirements.txt index b97b892..d3ab798 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,4 @@ dateparser>=1.2.0 Flask-Session==0.5.0 rank_bm25==0.2.2 python-dotenv==1.0.1 -==1.35.3 \ No newline at end of file +openai==1.35.3 \ No newline at end of 
file diff --git a/sources/dblp.py b/sources/dblp.py new file mode 100644 index 0000000..0562ac6 --- /dev/null +++ b/sources/dblp.py @@ -0,0 +1,126 @@ +import extruct +import requests +from objects import Person, Article +import logging +import os +import pprint +import utils +# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) +logger = logging.getLogger('nfdi_search_engine') + + +def extract_metadata(text: bytes) -> object: + """Extract all metadata present in the page and return a dictionary of metadata lists. + + Args: + text: The content of a requests.get( ) call + + Returns: + metadata (dict): Dictionary of json-ld, microdata, and opengraph lists. + Each of the lists present within the dictionary contains multiple dictionaries. + """ + metadata = extruct.extract(text, + uniform=True, + syntaxes=['json-ld', + 'microdata', + 'opengraph']) + assert isinstance(metadata, object) + return metadata + + +@utils.timeit +# def dblp(search_term: str, g, results): +def search(search_term: str, results): + + try: + + base_url = utils.config["search_url_dblp"] + url = base_url + search_term + + headers = {'Accept': 'application/json', + 'Content-Type': 'application/json', + 'User-Agent': utils.config["request_header_user_agent"] + } + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + + logger.debug(f'DBLP response status code: {response.status_code}') + logger.debug(f'DBLP response headers: {response.headers}') + + # TODO unclear why here are only a few but now all results returned + + metadata = extract_metadata(response.content) + # TODO unclear why this loop takes so long + #The profiler indicates that the JSON-LD parsing process is responsible for the majority of the execution time, taking approximately 18.21 seconds. + # + # I.e. 
the JSON-LD parsing takes that long + for data in metadata['microdata']: + if data['@type'] == 'Person': + ''' + results.append( + Person( + name=data["name"], + url=data["url"], + affiliation="" + ) + ) + ''' + elif data['@type'] == 'ScholarlyArticle': + if 'author' in data: + url = '' + if 'url' in data: + if type(data["url"]) == list: + url = ', '.join(data["url"]) + else: + url = data["url"] + publication = Article() + publication.source = 'DBLP' + publication.name = data["name"] + publication.url = url + publication.image = data["image"] + publication.description = '' + publication.abstract = '' + publication.keywords.append('') + publication.inLanguage.append("") + publication.datePublished = data["datePublished"] + publication.license = '' + author = Person() + author.type = 'Person' + if type(data["author"]) == list: + #author = ', '.join([authors["name"] for authors in data["author"]]) + for authors in data["author"]: + author2 = Person() + author2.name = authors["name"] + author2.type = 'Person' + publication.author.append(author2) + elif type(data["author"]) == dict: + author.name = data["author"]["name"] + publication.author.append(author) + else: + author.name = data["author"] + publication.author.append(author) + publication.encoding_contentUrl = '' + publication.encodingFormat = '' + + results['publications'].append(publication) + ''' + results.append( + Article( + title=data["name"], + url=url, + authors=author, + description='', + date=data["datePublished"] + ) + ) + ''' + logger.info(f"Got {len(results)} Researchers and scholarly articls from DBLP") + # return results + # g.parse(data=json.dumps(data), format='json-ld') + # logger.info(f"Graph g has {len(g)} statements after querying DBLP.") + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('DBLP') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') \ No newline at end of file diff --git a/sources/openaire.py b/sources/openaire.py new file mode 100644 index 0000000..1fd62c3 --- /dev/null +++ b/sources/openaire.py @@ -0,0 +1,181 @@ +import requests +import utils +from objects import Dataset, Author, Article, CreativeWork, Organization, Project +import logging + +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_string: str, results): + """ Obtain the results from Openaire request and handles them accordingly. + + Args: + search_string: keyword(s) to search for + results: search answer formatted into different data types according to Openaire result_types + and mapped to schema.org types. 
+ + Returns: + the results Object + """ + openaire_product_search(search_string, results) + openaire_project_search(search_string, results) + + logger.info(f"Got {len(results)} records from Openaire") + return results + + +def openaire_product_search(search_string, results): + + try: + api_url = 'https://api.openaire.eu/search/researchProducts' + response = requests.get(api_url, + params={"keywords": search_string, "format": "json", "size": 20}, + timeout=int(utils.config["request_timeout"])) + data = response.json() + logger.debug(f'Openaire product search response status code: {response.status_code}') + logger.debug(f'Openaire product search response headers: {response.headers}') + + # hits = data.get('response', {}).get('results', {}).get('result', []) + if response.status_code == 200: + try: + hits = data.get('response', {}).get('results', {}).get('result', []) + except AttributeError: + hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError + + for hit in hits: + pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:result', {}) + result_type = pro_result.get('resulttype', {}).get('@classid', 'other') + # check result type to create an Object of the right Class + if result_type == 'publication': + product = Article() + elif result_type == 'dataset': + product = Dataset() + else: + product = CreativeWork() + + product.source = 'Openaire' + collectedfrom = pro_result.get('collectedfrom', None) + if collectedfrom: + product.originalSource = collectedfrom.get('@name', None) + + product.genre = result_type + date = pro_result.get('dateofacceptance', None) + if date: + product.datePublished = date['$'] + + # title can be dict or list. If list, there are 'main title' and 'alternate title' + if type(pro_result.get('title')) is dict: + product.name = pro_result.get('title', {}).get('$', '') + elif type(pro_result.get('title')) is list: + for item in pro_result.get('title'): + if item['@classid'] == 'main title': + product.name = item['$'] + + # description can be dict or list + if type(pro_result.get('description')) is dict: + product.description = utils.remove_html_tags(pro_result.get('description', {}).get('$', '')) + elif type(pro_result.get('description')) is list: + product.description = utils.remove_html_tags(pro_result.get('description')[0].get('$', '')) + else: + product.description = '' + + # Language can be set or "und" = Undetermined + product.inLanguage = [] if pro_result.get('language', {}).get('@classid', '') == 'und' else [pro_result.get( + 'language', {}).get('@classid', '')] + + # pid can be dict or list + if type(pro_result.get('pid')) is dict: + product.identifier = pro_result.get('pid', {}).get('$', '') + elif type(pro_result.get('pid')) is list: + product.identifier = pro_result.get('pid', {})[0].get('$', '') + else: + product.identifier = '' + + # Creators can be dict, list, None + # creators = pro_result.get('creator', {}) if pro_result.get('creator') is not None else {} + creators = pro_result.get('creator', None) + if type(creators) is dict: + creator = Author() + creator.type = 'Person' + creator.name = creators.get('$', '') + product.author.append(creator) + elif type(creators) is list: + for item in creators: + creator = Author() + creator.type = 'Person' + creator.name = item.get('$', '') + product.author.append(creator) + + # Check genre to add result to right category + if product.genre == 'publication': + results['publications'].append(product) + elif product.genre == 'dataset' or product.genre == 'software': + 
results['resources'].append(product) + else: + results['others'].append(product) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENAIRE') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def openaire_project_search(search_string, results): + + try: + api_url = 'https://api.openaire.eu/search/projects' + response = requests.get(api_url, + params={"name": search_string, "format": "json", "size": 20}, + timeout=int(utils.config["request_timeout"])) + data = response.json() + logger.debug(f'Openaire project search response status code: {response.status_code}') + logger.debug(f'Openaire project search response headers: {response.headers}') + + if response.status_code == 200: + try: + hits = data.get('response', {}).get('results', {}).get('result', []) + except AttributeError: + hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError + + for hit in hits: + pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:project', {}) + project = Project() + project.source = 'Openaire' + project.name = pro_result.get('title', {}).get('$', '') + project.dateStart = pro_result.get('startdate', {}).get('$', '') + project.dateEnd = pro_result.get('enddate', {}).get('$', '') + project.identifier = pro_result.get('callidentifier', {}).get('$', '') + + # fundingtree can be dict or list + # fundingtree = pro_result.get('fundingtree', {}) if pro_result.get('fundingtree') is not None else {} + fundingtree = pro_result.get('fundingtree', None) + if type(fundingtree) is dict: + orga = Organization() + orga.name = fundingtree.get('name', {}).get('$', '') + project.funder.append(orga) + elif type(fundingtree) is list: + for item in fundingtree: + orga = Organization() + orga.name = item.get('name', {}).get('$', '') + project.funder.append(orga) + + # "rels" can be None, dict, list + relations = pro_result.get('rels', {}).get('rel', {}) if pro_result.get('rels', {}) is not None else [] + if type(relations) is dict: + relations = [relations] + + # This need a review. Type 'Organization' ? 
+ for rel in relations: + author_obj = Author() + author_obj.type = 'Organization' + author_obj.name = (rel.get('legalname', {}).get('$', '')) + project.author.append(author_obj) + results['others'].append(project) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENAIRE') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') \ No newline at end of file diff --git a/sources/openalex.py b/sources/openalex.py new file mode 100644 index 0000000..e1aede0 --- /dev/null +++ b/sources/openalex.py @@ -0,0 +1,215 @@ +import requests +import logging +from objects import Person, Author, Article, Institute, Funder, Publisher +import utils + +# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_key: str, results): + find_works(search_key, results) + find_authors(search_key, results) + # find_institute(search_key, results) + # find_funder(search_key, results) + # find_publisher(search_key, results) + logger.info(f"Got {len(results)} author, publication, and institute records from OpenAlex") + return results + + +def find_authors(search_key, results): + + try: + base_url = utils.config["search_url_openalex_authors"] + headers = {'Accept': 'application/json', + 'Content-Type': 'application/json', + 'User-Agent': utils.config["request_header_user_agent"] + } + response = requests.get(base_url + search_key, headers=headers, timeout=int(utils.config["request_timeout"])) + + if response.status_code == 200: + search_result = response.json() + + records_found = search_result['meta']['count'] + logger.info(f'OpenAlex Authors - {records_found} records found') + + authors = search_result.get('results', None) + if authors: + for author in authors: + authorObj = Author() + authorObj.source = 'OpenAlex' + authorObj.name = author.get('display_name', '') + authorObj.orcid = author.get('orcid', '') + + last_known_institution = author.get('last_known_institution', {}) + if last_known_institution: + authorObj.affiliation = author.get('last_known_institution', {}).get('display_name') + else: + authorObj.affiliation = '' + authorObj.works_count = author.get('works_count', '') + authorObj.cited_by_count = author.get('cited_by_count', '') + + results['researchers'].append(authorObj) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENALEX') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def find_works(search_key, results): + + try: + api_url = "https://api.openalex.org/works?search=" + api_response = requests.get(api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code != 404: + api_data = api_response.json() + for work in api_data['results']: + if 'id' in work: + if work["display_name"] is None \ + or work["id"] is None \ + or work["doi"] is None \ + or work["publication_date"] is None: + continue + publication = Article() + publication.source = 'OpenAlex' + publication.name = utils.remove_html_tags(work["display_name"]) + publication.url = work["doi"] + # publication.image = hit_source.get("image", "") + publication.description = '' + if not work["abstract_inverted_index"] is None: + publication.description = generate_string_from_keys(work["abstract_inverted_index"]) # Generate the string using keys from the dictionary + publication.abstract = '' + keywords = 
work["concepts"] + if keywords: + for keyword in keywords: + publication.keywords.append(keyword["display_name"]) + + publication.inLanguage.append(str(work["language"])) + publication.datePublished = str(work["publication_date"]) + publication.license = '' + if not work["primary_location"]["license"] is None: + publication.license = work["primary_location"]["license"] + + if len(work["authorships"]) == 1: + author = Person() + author.name = work["authorships"][0]["author"]["display_name"] + author.type = 'Person' + author.identifier = work["id"] + publication.author.append(author) + else: + # authorship = ', '.join( + # current_author["author"]["display_name"] for current_author in work["authorships"]) + for current_author in work["authorships"]: + author = Person() + author.name = current_author["author"]["display_name"] + author.type = 'Person' + author.identifier = current_author["author"]["orcid"] + publication.author.append(author) + + publication.encoding_contentUrl = '' + publication.encodingFormat = '' + + results['publications'].append(publication) + '''' + results.append( + Article( + title=work["display_name"], + url=work["id"], + authors=author, + description='', + date=str(work["publication_year"]) + ) + ) + ''' + + # logger.info(f'Got {len(results)} publication records from OpenAlex') + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENALEX') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def find_institute(search_key, results): + institute_api_url = "https://api.openalex.org/institutions?search=" + api_response = requests.get(institute_api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code != 404: + api_data = api_response.json() + for institute in api_data["results"]: + if 'id' in institute: + institute_acronym = ', '.join( + inst_acronym for inst_acronym in institute["display_name_acronyms"]) + + description = '' + if 'wikipedia' in institute["ids"]: + # institute_wikipedia_link = institute["ids"]["wikipedia"] + description = utils.read_wikipedia(institute["display_name"]) + + institute_country = '' + if 'country' in institute["geo"]: + institute_country = institute["geo"]["country"] + results.append( + Institute( + id=institute["id"], + name=institute["display_name"], + country=institute_country, + institute_type=institute["type"], + acronyms_name=institute_acronym, + homepage_url=institute["homepage_url"], + description=description) + ) + # logger.info(f'Got {len(results)} institute records from OpenAlex') + + +def find_funder(search_key, results): + funder_api_url = "https://api.openalex.org/funders?search=" + api_response = requests.get(funder_api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code == 404: + return + api_data = api_response.json() + for funder in api_data["results"]: + if 'id' in funder: + results.append( + Funder( + id=funder["id"], + name=funder["display_name"], + homepage_url=funder["homepage_url"], + country_code=funder["country_code"], + grants_count=funder["grants_count"], + works_count=funder["works_count"], + description=funder["description"]) + ) + + +def find_publisher(search_key, results): + publisher_api_url = "https://api.openalex.org/publishers?search=" + api_response = requests.get(publisher_api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code == 404: + return + api_data = api_response.json() + for 
publisher in api_data["results"]: + country_codes = ', '.join( + country_code for country_code in publisher["country_codes"]) + h_index = '' + if 'h_index' in publisher["summary_stats"]: + h_index = publisher["summary_stats"]["h_index"] + if 'id' in publisher: + results.append( + Publisher( + id=publisher["id"], + name=publisher["display_name"], + country_codes=country_codes, + works_count=publisher["works_count"], + homepage_url=publisher['homepage_url'], + h_index=h_index, + description='') + ) + + +def generate_string_from_keys(dictionary): + keys_list = list(dictionary.keys()) + keys_string = " ".join(keys_list) + return keys_string diff --git a/sources/openalex_researchers.py b/sources/openalex_researchers.py index 6e4e53d..6682684 100644 --- a/sources/openalex_researchers.py +++ b/sources/openalex_researchers.py @@ -8,6 +8,7 @@ from openai import OpenAI import json from openai import OpenAI + import json # logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) logger = logging.getLogger('nfdi_search_engine') @@ -311,7 +312,7 @@ def get_researcher_details(url): model="gpt-3.5-turbo", ) about_section = response.choices[0].text.strip() - researcher.about = chat_completion.choices[0].message.content.strip() + # researcher.about = chat_completion.choices[0].message.content.strip() except Exception as ex: logger.error(f'Exception: {str(ex)}') @@ -391,3 +392,5 @@ def get_researcher_banner(researcher: Author): logger.error(traceback.format_exc()) return researcher + + return researcher \ No newline at end of file diff --git a/sources/wikidata.py b/sources/wikidata.py new file mode 100644 index 0000000..54489aa --- /dev/null +++ b/sources/wikidata.py @@ -0,0 +1,171 @@ +import requests +import logging +from objects import Article, Author +from string import Template +from datetime import datetime +from dateutil import parser +import utils + +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_string: str, results): + """ Obtain the results from Wikidata request and handles them accordingly. + + Args: + search_string: keyword(s) to search for + results: search answer are formatted according to schema.org types Article, Author, ... + + Returns: + the results array + """ + wikidata_person_search(search_string, results) + wikidata_article_search(search_string, results) + + logger.info(f"Got {len(results['researchers'])} author and {len(results['publications'])} publication records from Wikidata") + return results + + +def wikidata_article_search(search_string: str, results): + try: + + url = 'https://query.wikidata.org/sparql' + headers = {'User-Agent': 'https://nfdi-search.nliwod.org/'} + query_template = Template(''' + SELECT DISTINCT ?item ?label ?date #(year(?date)as ?dateYear) + (group_concat(DISTINCT ?authorsName; separator=",") as ?authorsLabel) + (group_concat(DISTINCT ?authors2; separator=",") as ?authorsString) + WHERE + { + SERVICE wikibase:mwapi + { + bd:serviceParam wikibase:endpoint "www.wikidata.org"; + wikibase:limit "once"; + wikibase:api "Generator"; + mwapi:generator "search"; + mwapi:gsrsearch "$search_string"; + mwapi:gsrlimit "150". + ?item wikibase:apiOutputItem mwapi:title. + } + ?item rdfs:label ?label. FILTER( LANG(?label)="en" ) + ?item wdt:P31/wdt:P279* wd:Q11826511. + ?item wdt:P577 ?date . + ?item wdt:P50 ?authors. + ?authors rdfs:label ?authorsName . 
FILTER( LANG(?authorsName)="en" ) + optional {?item wdt:P2093 ?authors2.} + } + GROUP BY ?item ?label ?date + #ORDER BY DESC(?dateYear) + ''') + + response = requests.get(url, + params={'format': 'json', 'query': query_template.substitute(search_string=search_string), + }, headers=headers, timeout=int(utils.config["request_timeout"])) + logger.debug(f'Wikidata article search response status code: {response.status_code}') + logger.debug(f'Wikidata article search response headers: {response.headers}') + + if response.status_code == 200: + data = response.json() + if data["results"]["bindings"]: + for result in data["results"]["bindings"]: + publication = Article() + publication.source = 'Wikidata' + publication.url = result['item'].get('value', "") + publication.name = result['label'].get('value', "") + date_obj = parser.parse(result.get('date', {}).get('value', "")) + date = datetime.strftime(date_obj, '%Y-%m-%d') + publication.datePublished = date # result.get('date', {}).get('value', "") + if result['authorsLabel'].get("value"): + authors_list = result['authorsLabel'].get("value", "").rstrip(",").split(",") + for item in authors_list: + author = Author() + author.name = item + author.type = 'Person' + publication.author.append(author) + if result['authorsString'].get("value"): + authors_list = result['authorsString'].get("value", "").rstrip(",").split(",") + for item in authors_list: + author = Author() + author.name = item + author.type = 'Person' + publication.author.append(author) + results['publications'].append(publication) + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('WIKIDATA') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def wikidata_person_search(search_string: str, results): + try: + url = 'https://query.wikidata.org/sparql' + headers = {'User-Agent': 'https://nfdi-search.nliwod.org/'} + query_template = Template(''' +SELECT DISTINCT ?item ?itemLabel ?orcid (SAMPLE(?employerLabel) as ?employerSampleLabel) ?nationalityLabel ?givenNameLabel ?familyNameLabel + WHERE + { + SERVICE wikibase:mwapi + { + bd:serviceParam wikibase:endpoint "www.wikidata.org"; + wikibase:api "EntitySearch"; + + mwapi:search "$search_string"; + mwapi:language "en"; + mwapi:limit "150". + ?item wikibase:apiOutputItem mwapi:item. + } + #?item (wdt:P279*/wdt:P31) wd:Q482980 . + ?item wdt:P106 ?occ . + ?occ wdt:P279* wd:Q1650915 . + OPTIONAL {?item wdt:P496 ?orcid .} + OPTIONAL {?item wdt:P27 ?nationality.} + OPTIONAL {?item wdt:P735 ?givenName.} + OPTIONAL {?item wdt:P734 ?familyName.} + OPTIONAL { + ?item p:P108 ?st. + ?st ps:P108 ?employer. + ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) + ?st pq:P580 ?date. + MINUS {?st pq:P582 ?enddate.} + } + OPTIONAL {?item wdt:P108 ?employer. + ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) + } + + SERVICE wikibase:label { + bd:serviceParam wikibase:language "en" . 
+ } + } +GROUP by ?item ?itemLabel ?orcid ?nationalityLabel ?givenNameLabel ?familyNameLabel + + ''') + + response = requests.get(url, + params={'format': 'json', 'query': query_template.substitute(search_string=search_string), + }, headers=headers, timeout=int(utils.config["request_timeout"])) + logger.debug(f'Wikidata person search response status code: {response.status_code}') + logger.debug(f'Wikidata person search response headers: {response.headers}') + + if response.status_code == 200: + data = response.json() + if data["results"]["bindings"]: + for result in data["results"]["bindings"]: + author = Author() + author.source = 'Wikidata' + author.url = result['item'].get('value', "") + author.name = result['itemLabel'].get('value', "") + author.givenName = result.get('givenNameLabel', {}).get('value', "") + author.familyName = result.get('familyNameLabel', {}).get('value', "") + author.affiliation = result.get('employerSampleLabel', {}).get('value', "") + author.nationality = result.get('nationalityLabel', {}).get('value', "") + author.orcid = result.get('orcid', {}).get('value', "") + + results['researchers'].append(author) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('WIKIDATA') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') diff --git a/sources/zenodo.py b/sources/zenodo.py index efe39e4..a3a7ebf 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -198,15 +198,15 @@ def get_resource(doi: str): source = "Zenodo" start_index = doi.find("zenodo.") + len("zenodo.") - if(start_index): - numeric_string = doi[start_index:] + if start_index!= -1: + doi = doi[start_index:] else: - numeric_string = doi + doi = doi try: search_results = data_retriever.retrieve_single_object(source=source, base_url=utils.config["search_url_zenodo"], - doi=numeric_string) + doi = doi) search_result = search_results.get("hits", {}).get("hits", []) search_result = search_result[0] metadata = search_result.get('metadata', {}) @@ -217,7 +217,8 @@ def get_resource(doi: str): resource.datePublished = metadata.get("publication_date", "") resource.inLanguage.append(metadata.get("language", "")) resource.license = metadata.get("license", "") - + files = search_result.get('files','') + resource.encoding_contentUrl = {file["key"]: file["links"]["self"] for file in files} resource.description = utils.remove_html_tags(metadata.get("description", "")) resource.abstract = resource.description authors = metadata.get("creators", []) diff --git a/templates/resource-details.html b/templates/resource-details.html index 3b3534b..b7c20e0 100644 --- a/templates/resource-details.html +++ b/templates/resource-details.html @@ -110,15 +110,22 @@
    ABSTRACT
    SUPPLEMENTAL MATERIAL
    + data-bs-placement="top" title="Retrieved from Zenodo">
    +
-
+
+            {% for file_name, file_url in resource.encoding_contentUrl.items() %}
+            <a href="{{ file_url }}">{{ file_name }}</a>
+
+            {% endfor %}
REFERENCES
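The SUPPLEMENTAL MATERIAL loop above is driven by the dictionary that zenodo.get_resource() now builds from the record's files array. A self-contained sketch of that mapping, using an illustrative (not real) Zenodo payload:

    # Illustrative file entries; real Zenodo records carry more fields per file.
    files = [
        {"key": "dataset.csv", "links": {"self": "https://zenodo.org/api/files/abc/dataset.csv"}},
        {"key": "README.md", "links": {"self": "https://zenodo.org/api/files/abc/README.md"}},
    ]

    # Same comprehension as in zenodo.get_resource(): file name -> download link.
    encoding_contentUrl = {file["key"]: file["links"]["self"] for file in files}

    for file_name, file_url in encoding_contentUrl.items():
        print(file_name, "->", file_url)
    # dataset.csv -> https://zenodo.org/api/files/abc/dataset.csv
    # README.md -> https://zenodo.org/api/files/abc/README.md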
From 02f55b8e0d235e3eb4f5f3eb0291145b3a431a9a Mon Sep 17 00:00:00 2001 From: Mugdhaa21 Date: Tue, 16 Jul 2024 23:25:43 +0530 Subject: [PATCH 9/9] slight changes --- sources/zenodo.py | 78 +++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/sources/zenodo.py b/sources/zenodo.py index a3a7ebf..f8f9310 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -74,45 +74,45 @@ def search(search_term, results): if(digitalObj.conditionsOfAccess == ''): digitalObj.conditionsOfAccess = metadata.get('access_right','') - # relation_map = { - # 'iscitedby': 'isCitedBy', - # 'issupplementto': 'isSupplementTo', - # 'ispartof': 'isPartOf', - # 'cites': 'cites', - # 'issourceof': 'isSourceOf', - # 'isderivedfrom': 'isDerivedFrom', - # 'issupplementedby': 'isSupplementedBy', - # 'ispreviousversionof': 'isPreviousVersionOf', - # 'documents': 'documents', - # 'haspart': 'hasPart' - # } - - # related_identifiers = metadata.get('related_identifiers', []) - - # for related_identifier in related_identifiers: - # relation = related_identifier.get('relation', '').lower() - # identifier = related_identifier.get('identifier', '') - - # if relation == 'iscitedby': - # digitalObj.isCitedBy.append(identifier) - # elif relation == 'issupplementto': - # digitalObj.isSupplementTo.append(identifier) - # elif relation == 'ispartof': - # digitalObj.isPartOf.append(identifier) - # elif relation == 'cites': - # digitalObj.cites.append(identifier) - # elif relation == 'issourceof': - # digitalObj.isSourceOf.append(identifier) - # elif relation == 'isderivedfrom': - # digitalObj.isDerivedFrom.append(identifier) - # elif relation == 'issupplementedby': - # digitalObj.isSupplementedBy.append(identifier) - # elif relation == 'ispreviousversionof': - # digitalObj.isPreviousVersionOf.append(identifier) - # elif relation == 'documents': - # digitalObj.documents.append(identifier) - # elif relation == 'haspart': - # digitalObj.hasPart.append(identifier) + relation_map = { + 'iscitedby': 'isCitedBy', + 'issupplementto': 'isSupplementTo', + 'ispartof': 'isPartOf', + 'cites': 'cites', + 'issourceof': 'isSourceOf', + 'isderivedfrom': 'isDerivedFrom', + 'issupplementedby': 'isSupplementedBy', + 'ispreviousversionof': 'isPreviousVersionOf', + 'documents': 'documents', + 'haspart': 'hasPart' + } + + related_identifiers = metadata.get('related_identifiers', []) + + for related_identifier in related_identifiers: + relation = related_identifier.get('relation', '').lower() + identifier = related_identifier.get('identifier', '') + + if relation == 'iscitedby': + digitalObj.isCitedBy.append(identifier) + elif relation == 'issupplementto': + digitalObj.isSupplementTo.append(identifier) + elif relation == 'ispartof': + digitalObj.isPartOf.append(identifier) + elif relation == 'cites': + digitalObj.cites.append(identifier) + elif relation == 'issourceof': + digitalObj.isSourceOf.append(identifier) + elif relation == 'isderivedfrom': + digitalObj.isDerivedFrom.append(identifier) + elif relation == 'issupplementedby': + digitalObj.isSupplementedBy.append(identifier) + elif relation == 'ispreviousversionof': + digitalObj.isPreviousVersionOf.append(identifier) + elif relation == 'documents': + digitalObj.documents.append(identifier) + elif relation == 'haspart': + digitalObj.hasPart.append(identifier) authors = metadata.get("creators", []) for author in authors:
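PATCH 9/9 reinstates both the relation_map dictionary and the long if/elif chain, yet the chain never consults the dictionary. Assuming the digital objects expose list attributes named exactly as in relation_map's values (isCitedBy, hasPart, and so on), the chain could collapse into a single dict-driven dispatch; a sketch, not part of the patch:

    relation_map = {
        'iscitedby': 'isCitedBy', 'issupplementto': 'isSupplementTo',
        'ispartof': 'isPartOf', 'cites': 'cites', 'issourceof': 'isSourceOf',
        'isderivedfrom': 'isDerivedFrom', 'issupplementedby': 'isSupplementedBy',
        'ispreviousversionof': 'isPreviousVersionOf', 'documents': 'documents',
        'haspart': 'hasPart',
    }

    def apply_relations(digital_obj, metadata: dict) -> None:
        """Append each related identifier to the matching list attribute."""
        for rel in metadata.get('related_identifiers', []):
            attr = relation_map.get(rel.get('relation', '').lower())
            identifier = rel.get('identifier', '')
            if attr and identifier:
                # Assumes digital_obj.<attr> already exists as a list.
                getattr(digital_obj, attr).append(identifier)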