diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5d8771c..a730a93 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: 'Deploy NFDI Search Engine on sems-kg-1' on: workflow_dispatch: push: - branches: + branches: - main jobs: @@ -18,7 +18,7 @@ jobs: - name: 'Delete old Docker image' run: docker image rm nfdi-search-engine-search-engine - name: 'Copy logging.conf' run: cp logging.conf.example logging.conf - name: 'Create .env' run: | echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> ./.env diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..3093fcd --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "windows-gcc-x86", + "includePath": [ + "${workspaceFolder}/**" + ], + "compilerPath": "E:/make/bin/gcc.exe", + "cStandard": "${default}", + "cppStandard": "${default}", + "intelliSenseMode": "windows-gcc-x86", + "compilerArgs": [ + "" + ] + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..dc0d342 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,62 @@ +{ + "C_Cpp_Runner.cCompilerPath": "gcc", + "C_Cpp_Runner.cppCompilerPath": "g++", + "C_Cpp_Runner.debuggerPath": "gdb", + "C_Cpp_Runner.cStandard": "", + "C_Cpp_Runner.cppStandard": "", + "C_Cpp_Runner.msvcBatchPath": "C:/Program Files/Microsoft Visual Studio/VR_NR/Community/VC/Auxiliary/Build/vcvarsall.bat", + "C_Cpp_Runner.useMsvc": false, + "C_Cpp_Runner.warnings": [ + "-Wall", + "-Wextra", + "-Wpedantic", + "-Wshadow", + "-Wformat=2", + "-Wcast-align", + "-Wconversion", + "-Wsign-conversion", + "-Wnull-dereference" + ], + "C_Cpp_Runner.msvcWarnings": [ + "/W4", + "/permissive-", + "/w14242", + "/w14287", + "/w14296", + "/w14311", + "/w14826", + "/w44062", + "/w44242", + "/w14905", + "/w14906", + "/w14263", + "/w44265", + "/w14928" + ], + "C_Cpp_Runner.enableWarnings": true, + "C_Cpp_Runner.warningsAsError": false, + "C_Cpp_Runner.compilerArgs": [], + "C_Cpp_Runner.linkerArgs": [], + "C_Cpp_Runner.includePaths": [], + "C_Cpp_Runner.includeSearch": [ + "*", + "**/*" + ], + "C_Cpp_Runner.excludeSearch": [ + "**/build", + "**/build/**", + "**/.*", + "**/.*/**", + "**/.vscode", + "**/.vscode/**" + ], + "C_Cpp_Runner.useAddressSanitizer": false, + "C_Cpp_Runner.useUndefinedSanitizer": false, + "C_Cpp_Runner.useLeakSanitizer": false, + "C_Cpp_Runner.showCompilationTime": false, + "C_Cpp_Runner.useLinkTimeOptimization": false, + "C_Cpp_Runner.msvcSecureNoWarnings": false, + "files.associations": { + "iostream": "cpp" + } +} \ No newline at end of file diff --git a/a.py b/a.py new file mode 100644 index 0000000..96ee045 --- /dev/null +++ b/a.py @@ -0,0 +1,66 @@ +import urllib.parse +import requests +import logging +import utils +import json +from sources import data_retriever +from objects import CreativeWork, Author + +base_url = "https://zenodo.org/api/records?size=25&q=" +doi = "r3730f562f9e::324df2bd7d05a0942f31f0fe34e2eefa" + +# search_result = data_retriever.retrieve_single_object(source=source, +# base_url= +# doi=doi) + +encoded_doi = urllib.parse.quote_plus(string=doi, safe='()?&=,') +url = base_url + encoded_doi +print(url) +headers = { + 'Accept': 'application/json', + 'Content-Type': 'application/json', + 'User-Agent': utils.config["request_header_user_agent"] +} +
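+# NOTE: scratch/debug script - it mirrors the commented-out data_retriever.retrieve_single_object call above so the raw Zenodo API response can be inspected directly; the hard-coded identifier is only a sample value, and utils.config must provide request_header_user_agent and request_timeout.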
+logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +try: + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + logger.debug(f'Response status code: {response.status_code}') + + if response.status_code == 200: + search_results = response.json() + search_results = utils.clean_json(search_results) + # print(search_result) + search_result = search_results.get("hits", {}).get("hits", []) + search_result = search_result[0] # assumes the query returned at least one hit + metadata = search_result.get("metadata", {}) + resource = CreativeWork() + resource.name = metadata.get("title", "") + resource.url = metadata.get('links', {}).get('self', '') + resource.identifier = metadata.get("doi", "") + resource.datePublished = metadata.get("publication_date", "") + resource.inLanguage.append(metadata.get("language", "")) + resource.license = metadata.get("license", "") + + resource.description = utils.remove_html_tags(metadata.get("description", "")) + resource.abstract = resource.description + + authors = metadata.get("creators", []) + for author in authors: + _author = Author() + _author.type = 'Person' + _author.name = author.get("name", "") + _author.identifier = author.get("orcid", "") + _author.affiliation = author.get("affiliation", "") + resource.author.append(_author) + print(json.dumps(search_result, indent=4)) + # print( resource.name) + else: + logger.error(f'Failed to retrieve data: {response.status_code}') + +except requests.exceptions.RequestException as e: + logger.error(f'An error occurred: {e}') diff --git a/main.py b/main.py index 3f7af23..a490346 100644 --- a/main.py +++ b/main.py @@ -46,8 +57,12 @@ @app.route('/') def index(): - if (utils.env_config["OPENAI_API_KEY"] == ""): - return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.')) + # if (utils.env_config["OPENAI_API_KEY"] == ""): + # return make_response(render_template('error.html',error_message='Environment variables are not set. Kindly set all the required variables.'))
response = make_response(render_template('index.html')) @@ -77,14 +92,14 @@ def search_results(): search_term = request.args.get('txtSearchTerm') session['search-term'] = search_term for k in results.keys(): results[k] = [] threads = [] # add all the sources here in this list; for simplicity we should use the exact module name - # ensure the main method which execute the search is named "search" in the module + # ensure the main method which executes the search is named "search" in the module sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee, - eudat, openaire_products, re3data, orkg, openalex_researchers] - # sources = [openalex_publications] + eudat, eulg, openaire_products, re3data, orkg, openalex_researchers] for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() @@ -96,42 +111,43 @@ def search_results(): # deduplicator.convert_publications_to_csv(results["publications"]) # results["publications"] = deduplicator.perform_entity_resolution_publications(results["publications"]) - # sort all the results in each category - results["publications"] = utils.sort_search_results(search_term, results["publications"]) - results["researchers"] = utils.sort_search_results(search_term, results["researchers"]) - + results["publications"] = utils.sort_search_results(search_term, results["publications"]) + results["researchers"] = utils.sort_search_results(search_term, results["researchers"]) + results["resources"] = utils.sort_search_results(search_term, results["resources"]) #store the search results in the session - session['search-results'] = copy.deepcopy(results) + session['search-results'] = copy.deepcopy(results) + # Chatbot - push search results to chatbot server for embeddings generation if (utils.config['chatbot_feature_enable']): # Convert a UUID to a 32-character hexadecimal string search_uuid = uuid.uuid4().hex session['search_uuid'] = search_uuid def send_search_results_to_chatbot(search_uuid: str): print('request is about to start') chatbot_server = utils.config['chatbot_server'] save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings'] request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}' response = requests.post(request_url, json=json.dumps(results, default=vars)) response.raise_for_status() print('request completed') # create a new daemon thread chatbot_thread = threading.Thread(target=send_search_results_to_chatbot, args=(search_uuid,), daemon=True) # start the new thread chatbot_thread.start() # sleep(1) - + # on the first 
page load, only push top 20 records in each category - number_of_records_to_show_on_page_load = int(utils.config["number_of_records_to_show_on_page_load"]) - total_results = {} # the dict to keep the total number of search results + number_of_records_to_show_on_page_load = int(utils.config["number_of_records_to_show_on_page_load"]) + total_results = {} # the dict to keep the total number of search results displayed_results = {} # the dict to keep the total number of search results currently displayed to the user - + for k, v in results.items(): logger.info(f'Got {len(v)} {k}') total_results[k] = len(v) @@ -139,12 +155,12 @@ def send_search_results_to_chatbot(search_uuid: str): displayed_results[k] = len(results[k]) results["timedout_sources"] = list(set(results["timedout_sources"])) - logger.info('Following sources got timed out:' + ','.join(results["timedout_sources"])) + logger.info('The following sources timed out: ' + ','.join(results["timedout_sources"])) session['total_search_results'] = total_results - session['displayed_search_results'] = displayed_results - - template_response = render_template('results.html', results=results, total_results=total_results, search_term=search_term) + session['displayed_search_results'] = displayed_results + + template_response = render_template('results.html', results=results, total_results=total_results, search_term=search_term) logger.info('search server call completed - after render call') return template_response @@ -159,10 +175,10 @@ def load_more_publications(): total_search_results_publications = session['total_search_results']['publications'] displayed_search_results_publications = session['displayed_search_results']['publications'] - number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) results['publications'] = results['publications'][displayed_search_results_publications:displayed_search_results_publications+number_of_records_to_append_on_lazy_load] session['displayed_search_results']['publications'] = displayed_search_results_publications+number_of_records_to_append_on_lazy_load - return render_template('components/publications.html', results=results) + return render_template('components/publications.html', results=results) @app.route('/load-more-researchers', methods=['GET']) def load_more_researchers(): @@ -174,30 +190,46 @@ def load_more_researchers(): total_search_results_researchers = session['total_search_results']['researchers'] displayed_search_results_researchers = session['displayed_search_results']['researchers'] - number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) results['researchers'] = results['researchers'][displayed_search_results_researchers:displayed_search_results_researchers+number_of_records_to_append_on_lazy_load] session['displayed_search_results']['researchers'] = displayed_search_results_researchers+number_of_records_to_append_on_lazy_load - return render_template('components/researchers.html', results=results) + return render_template('components/researchers.html', results=results) + +@app.route('/load-more-resources', methods=['GET']) +def load_more_resources(): + print('load more resources') + + #define a new results dict for resources to take new resources from the search results stored in the 
session + results = {} + results['resources'] = session['search-results']['resources'] + + total_search_results_resources = session['total_search_results']['resources'] + displayed_search_results_resources = session['displayed_search_results']['resources'] + number_of_records_to_append_on_lazy_load = int(utils.config["number_of_records_to_append_on_lazy_load"]) + results['resources'] = results['resources'][displayed_search_results_resources:displayed_search_results_resources+number_of_records_to_append_on_lazy_load] + session['displayed_search_results']['resources'] = displayed_search_results_resources+number_of_records_to_append_on_lazy_load + return render_template('components/resources.html', results=results) @app.route('/are-embeddings-generated', methods=['GET']) def are_embeddings_generated(): #Check the embeddings readiness only if the chatbot feature is enabled otherwise return False if (utils.config['chatbot_feature_enable']): print('are_embeddings_generated') uuid = session['search_uuid'] chatbot_server = utils.config['chatbot_server'] are_embeddings_generated = utils.config['endpoint_are_embeddings_generated'] request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}" headers = { 'Content-Type': 'application/json' } response = requests.request("GET", request_url, headers=headers) json_response = response.json() print('json_response:', json_response) return str(json_response['file_exists']) else: return str(True) @app.route('/get-chatbot-answer', methods=['GET']) def get_chatbot_answer(): @@ -211,7 +243,7 @@ def get_chatbot_answer(): search_uuid = session['search_uuid'] answer = chatbot.getAnswer(question=question, search_uuid=search_uuid) - + return answer @@ -236,8 +268,11 @@ def format_digital_obj_url(value): for source in value.source: source_dict = {} source_dict['doi'] = value.identifier - source_dict['sname'] = source.name - source_dict['sid'] = source.identifier + if isinstance(source, str): + source_dict['sname'] = source + else: + source_dict['sname'] = source.name + source_dict['sid'] = source.identifier sources_list.append(source_dict) return json.dumps(sources_list) FILTERS["format_digital_obj_url"] = format_digital_obj_url @@ -245,7 +280,7 @@ def format_digital_obj_url(value): def format_authors_for_citations(value): authors = "" for author in value: - authors += (author.name + " and ") + authors += (author.name + " and ") - return authors.rstrip(' and ') + "." + # str.rstrip treats its argument as a character set and can eat trailing letters of the last author's name, so strip the exact separator instead + return authors.removesuffix(' and ') + "."
FILTERS["format_authors_for_citations"] = format_authors_for_citations @@ -255,6 +290,12 @@ def regex_replace(s, find, replace): return re.sub(find, replace, s) FILTERS["regex_replace"] = regex_replace +import re +def regex_replace(s, find, replace): + """A non-optimal implementation of a regex filter""" + return re.sub(find, replace, s) +FILTERS["regex_replace"] = regex_replace + from urllib.parse import unquote import ast @@ -263,10 +304,10 @@ def regex_replace(s, find, replace): def publication_details(sources): sources = unquote(sources) - sources = ast.literal_eval(sources) + sources = ast.literal_eval(sources) for source in sources: doi = source['doi'] - + publication = openalex_publications.get_publication(doi="https://doi.org/"+doi) response = make_response(render_template('publication-details.html', publication=publication)) @@ -276,8 +317,8 @@ def publication_details(sources): @app.route('/publication-details-references/', methods=['GET']) @utils.timeit def publication_details_references(doi): - print("doi:", doi) - + print("doi:", doi) + publication = crossref.get_publication(doi=doi) response = make_response(render_template('partials/publication-details/references.html', publication=publication)) @@ -287,7 +328,7 @@ def publication_details_references(doi): @app.route('/publication-details-recommendations/', methods=['GET']) @utils.timeit def publication_details_recommendations(doi): - print("DOI:", doi) + print("DOI:", doi) publications = semanticscholar.get_recommendations_for_publication(doi=doi) response = make_response(render_template('partials/publication-details/recommendations.html', publications=publications)) print("response:", response) @@ -296,34 +337,62 @@ def publication_details_recommendations(doi): @app.route('/publication-details-citations/', methods=['GET']) @utils.timeit def publication_details_citations(doi): - print("DOI:", doi) + print("DOI:", doi) publications = semanticscholar.get_citations_for_publication(doi=doi) response = make_response(render_template('partials/publication-details/citations.html', publications=publications)) print("response:", response) return response -@app.route('/resource-details') -def resource_details(): - response = make_response(render_template('resource-details.html')) +@app.route('/resource-details/', methods=['GET']) +def resource_details(sources): - # Set search-session cookie to the session cookie value of the first visit - if request.cookies.get('search-session') is None: - if request.cookies.get('session') is None: - response.set_cookie('search-session', str(uuid.uuid4())) - else: - response.set_cookie('search-session', request.cookies['session']) + sources = unquote(sources) + sources = ast.literal_eval(sources) + for source in sources: + doi = source['doi'] + resource = zenodo.get_resource(doi="https://doi.org/"+doi) + response = make_response(render_template('resource-details.html', resource=resource)) + print("response:", response) return response +@app.route('/resource-details-citations/', methods=['GET']) +@utils.timeit +def resource_details_citations(doi): + print("DOI:", doi) + resource = semanticscholar.get_citations_for_publication(doi=doi) + response = make_response(render_template('partials/publication-details/citations.html', resource=resource)) + print("response:", response) + return response + +@app.route('/resource-details-references/', methods=['GET']) +@utils.timeit +def resource_details_references(doi): + print("doi:", doi) + + resource = crossref.get_publication(doi=doi) + response = 
make_response(render_template('partials/publication-details/references.html', publication=publication)) + + print("response:", response) + return response + +@app.route('/resource-details-recommendations/', methods=['GET']) +@utils.timeit +def resource_details_recommendations(doi): + print("DOI:", doi) + publications = semanticscholar.get_recommendations_for_publication(doi=doi) + response = make_response(render_template('partials/publication-details/recommendations.html', publications=publications)) + print("response:", response) + return response @app.route('/researcher-details/', methods=['GET']) def researcher_details(index): # index = json.loads(index) # for result in results['researchers']: # if result.source[0].identifier.replace("https://openalex.org/", "") == index[0]['sid']: # researcher = result # break # logger.info(f'Found researcher {researcher}') researcher = openalex_researchers.get_researcher_details(index) response = make_response(render_template('researcher-details.html',researcher=researcher)) @@ -349,7 +418,6 @@ def researcher_banner(index): return jsonify() return jsonify(imageUrl = f'data:image/jpeg;base64,{researcher.banner}') - @app.route('/organization-details//', methods=['GET']) def organization_details(organization_id, organization_name): try: diff --git a/objects.py b/objects.py index 8669243..571d2f8 100644 --- a/objects.py +++ b/objects.py @@ -1,4 +1,4 @@ -from typing import Union, List +from typing import Union, List, Dict import dataclasses from dataclasses import dataclass, fields, field @dataclass @@ -8,7 +8,7 @@ class thing: description: str = "" url: str = "" image: str = "" #url of the image - identifier: str = "" #doi or pid will be stored as identifier + identifier: str = "" #doi or pid will be stored as identifier originalSource: str = "" source: list() = field(default_factory=list) # this list will have "thing" objects rankScore: float = 0 #bm25 ranking score for sorting the search results @@ -18,7 +18,7 @@ def __str__(self): strValue = "" for field in fields(self): # print(field.type) - # concatenate all the property values + # concatenate all the property values strValue += f"{getattr(self, field.name)}###" return strValue @@ -54,12 +54,29 @@ class Person(thing): nationality: str = "" # we can later link it to country #this should be a list workLocation: str = "" #this should be a list worksFor: Organization = None #this should be a list - -Organization.founder = List[Person] + +Organization.founder = List[Person] # Organization.funder = Union[Organization(), Person()] Organization.parentOrganization = Organization() +@dataclass +class Statistics(thing): + downloads: str = "" + unique_downloads: str = "" + views: str = "" + unique_views: str = "" + version_downloads: str = "" + version_unique_downloads: str = "" + version_unique_views: str = "" + version_views: str = "" + @dataclass class CreativeWork(thing): abstract: str = "" author: List[Union[Organization, Person]] = field(default_factory=list) citation: list() = field(default_factory=list) # this list will have "CreativeWork" objects 
countryOfOrigin: str = "" + conditionsOfAccess: str = "" + contributor: List[Union[Organization, Person]] = field(default_factory=list) creativeWorkStatus: str = "" dateCreated: str = "" dateModified: str = "" datePublished: str = "" - encoding_contentUrl: str = "" + encoding_contentUrl: Dict[str, str] = field(default_factory=dict) encodingFormat: str = "" - funder: Union[Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 + funder: Union[Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 funding: str = "" # we can change this to Grant genre: str = "" headline: str = "" @@ -87,23 +106,39 @@ class CreativeWork(thing): text: str = "" thumbnail: str = "" #ImageObject thumbnailUrl: str = "" #url - version: str = "" + version: str = "" + stats: Statistics = None + cites: List[str] = field(default_factory=list) + isPartOf: List[str] = field(default_factory=list) + isSupplementTo: List[str] = field(default_factory=list) + isSourceOf: List[str] = field(default_factory=list) + isCitedBy: List[str] = field(default_factory=list) + hasPart: List[str] = field(default_factory=list) + isSupplementedBy: List[str] = field(default_factory=list) + isPreviousVersionOf: List[str] = field(default_factory=list) + isDerivedFrom: List[str] = field(default_factory=list) + documents: List[str] = field(default_factory=list) + + @dataclass -class Article(CreativeWork): +class Article(CreativeWork): articleBody: str = "" pageEnd: str = "" pageStart: str = "" pagination: str = "" wordCount: str = "" + issue: str = "" + Journal: str = "" + JournalVolume: str = "" @dataclass -class Dataset(CreativeWork): +class Dataset(CreativeWork): distribution: str = "" issn: str = "" @dataclass class Author(Person): - orcid: str = "" # we should not have this attribute; orcid should be kept in + orcid: str = "" # we should not have this attribute; orcid should be kept in works_count: str = "" about: str = "" banner: str = "" @@ -114,7 +149,7 @@ #The 'Project' is a new addition to schema.org, and as of now, there are no defined properties for it @dataclass -class Project(Organization): +class Project(Organization): dateStart: str = "" dateEnd: str = "" dateLastModified : str = "" @@ -131,8 +166,9 @@ class SoftwareApplication(CreativeWork): distribution: str = "" issn: str = "" + softwareVersion: str = "" @dataclass -class LearningResource(CreativeWork): +class LearningResource(CreativeWork): assesses: str = "" #The item being described is intended to assess the competency or learning outcome defined by the referenced term. competencyRequired: str = "" educationalAlignment:str = "" @@ -142,7 +178,7 @@ teaches:str = "" #The item being described is intended to help a person learn the competency or learning outcome defined by the referenced term. 
@dataclass -class MediaObject(CreativeWork): +class MediaObject(CreativeWork): associatedArticle: str = "" bitrate: str = "" contentSize: str = "" @@ -162,9 +198,9 @@ class MediaObject(CreativeWork): startTime: str = "" uploadDate: str = "" width: str = "" - + @dataclass -class VideoObject(MediaObject): +class VideoObject(MediaObject): actor: str = "" caption: str = "" director: str = "" @@ -174,21 +210,21 @@ class VideoObject(MediaObject): videoFrameSize: str = "" videoQuality: str = "" @dataclass -class ImageObject(MediaObject): +class ImageObject(MediaObject): caption: str = "" embeddedTextCaption: str = "" exifData: str = "" #exif data for this object representativeOfPage: str = "" #Indicates whether this image is representative of the content of the page @dataclass -class Place(thing): +class Place(thing): additionalProperty: str = "" address: str = "" addressType: str = "" aggregateRating: str = "" amenityFeature: str = "" branchCode: str = "" - containedInPlace: str = "" + containedInPlace: str = "" containsPlace : str = "" event: str = "" faxNumber: str = "" @@ -203,17 +239,17 @@ class Place(thing): geoOverlaps: str = "" geoTouches: str = "" geoWithin: str = "" - globalLocationNumber: str = "" + globalLocationNumber: str = "" hasDriveThroughService: str = "" hasMap: str = "" - isAccessibleForFree: str = "" + isAccessibleForFree: str = "" isicV4: str = "" keywords: str = "" latitude: str = "" licence: str = "" logo: str = "" longitude: str = "" - maximumAttendeeCapacity: str = "" + maximumAttendeeCapacity: str = "" openingHoursSpecification: str = "" photo: str = "" placType: str = "" @@ -322,7 +358,7 @@ class Lesson: description: str date: str - + @dataclass class Publisher: id: str @@ -349,7 +385,7 @@ class Funder: class Gesis: resource_type: str url: str - date: str + date: str title: str description: str authors: str @@ -359,7 +395,7 @@ class Gesis: class Cordis: id: str url: str - date: str + date: str title: str description: str diff --git a/sources/data_retriever.py b/sources/data_retriever.py index 3c2f57c..28596cc 100644 --- a/sources/data_retriever.py +++ b/sources/data_retriever.py @@ -8,7 +8,7 @@ logger = logging.getLogger('nfdi_search_engine') def retrieve_data(source: str, base_url: str, search_term: str, results): - + try: search_term = urllib.parse.quote_plus(string=search_term, safe='()?&=,') @@ -23,8 +23,8 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): 'User-Agent': utils.config["request_header_user_agent"] } # print("url:", url) - - response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) logger.debug(f'{source} response status code: {response.status_code}') logger.debug(f'{source} response headers: {response.headers}') @@ -36,7 +36,7 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): #clean the json response; remove all the keys which don't have any value search_result = utils.clean_json(search_result) - return search_result + return search_result else: logger.error(f'Response status code: {str(response.status_code)}') @@ -47,8 +47,8 @@ def retrieve_data(source: str, base_url: str, search_term: str, results): raise ex def retrieve_single_object(source: str, base_url: str, doi: str): - - try: + + try: doi = urllib.parse.quote_plus(string=doi, safe='()?&=,') url = base_url + doi # print('url:', url) @@ -56,7 +56,7 @@ def retrieve_single_object(source: str, base_url: str, doi: str): 
'Content-Type': 'application/json', 'User-Agent': utils.config["request_header_user_agent"] } - response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) logger.debug(f'{source} response status code: {response.status_code}') # logger.debug(f'{source} response headers: {response.headers}') @@ -68,7 +68,7 @@ def retrieve_single_object(source: str, base_url: str, doi: str): #clean the json response; remove all the keys which don't have any value search_result = utils.clean_json(search_result) - return search_result + return search_result else: logger.error(f'Response status code: {str(response.status_code)}') diff --git a/sources/dblp.py b/sources/dblp.py new file mode 100644 index 0000000..0562ac6 --- /dev/null +++ b/sources/dblp.py @@ -0,0 +1,126 @@ +import extruct +import requests +from objects import Person, Article +import logging +import os +import pprint +import utils +# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) +logger = logging.getLogger('nfdi_search_engine') + + +def extract_metadata(text: bytes) -> object: + """Extract all metadata present in the page and return a dictionary of metadata lists. + + Args: + text: The content of a requests.get() call + + Returns: + metadata (dict): Dictionary of json-ld, microdata, and opengraph lists. + Each of the lists present within the dictionary contains multiple dictionaries. + """ + metadata = extruct.extract(text, + uniform=True, + syntaxes=['json-ld', + 'microdata', + 'opengraph']) + assert isinstance(metadata, object) + return metadata + + +@utils.timeit +# def dblp(search_term: str, g, results): +def search(search_term: str, results): + + try: + + base_url = utils.config["search_url_dblp"] + url = base_url + search_term + + headers = {'Accept': 'application/json', + 'Content-Type': 'application/json', + 'User-Agent': utils.config["request_header_user_agent"] + } + response = requests.get(url, headers=headers, timeout=int(utils.config["request_timeout"])) + + logger.debug(f'DBLP response status code: {response.status_code}') + logger.debug(f'DBLP response headers: {response.headers}') + + # TODO: unclear why only a few results are returned here rather than all of them + + metadata = extract_metadata(response.content) + # TODO: unclear why this loop takes so long + #The profiler indicates that the JSON-LD parsing process is responsible for the majority of the execution time, taking approximately 18.21 seconds. + # + # I.e. 
the JSON-LD parsing takes that long + for data in metadata['microdata']: + if data['@type'] == 'Person': + ''' + results.append( + Person( + name=data["name"], + url=data["url"], + affiliation="" + ) + ) + ''' + elif data['@type'] == 'ScholarlyArticle': + if 'author' in data: + url = '' + if 'url' in data: + if type(data["url"]) == list: + url = ', '.join(data["url"]) + else: + url = data["url"] + publication = Article() + publication.source = 'DBLP' + publication.name = data["name"] + publication.url = url + publication.image = data["image"] + publication.description = '' + publication.abstract = '' + publication.keywords.append('') + publication.inLanguage.append("") + publication.datePublished = data["datePublished"] + publication.license = '' + author = Person() + author.type = 'Person' + if type(data["author"]) == list: + #author = ', '.join([authors["name"] for authors in data["author"]]) + for authors in data["author"]: + author2 = Person() + author2.name = authors["name"] + author2.type = 'Person' + publication.author.append(author2) + elif type(data["author"]) == dict: + author.name = data["author"]["name"] + publication.author.append(author) + else: + author.name = data["author"] + publication.author.append(author) + publication.encoding_contentUrl = '' + publication.encodingFormat = '' + + results['publications'].append(publication) + ''' + results.append( + Article( + title=data["name"], + url=url, + authors=author, + description='', + date=data["datePublished"] + ) + ) + ''' + logger.info(f"Got {len(results)} researchers and scholarly articles from DBLP") + # return results + # g.parse(data=json.dumps(data), format='json-ld') + # logger.info(f"Graph g has {len(g)} statements after querying DBLP.") + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('DBLP') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') \ No newline at end of file diff --git a/sources/eudat.py b/sources/eudat.py index 853ad1a..3536892 100644 --- a/sources/eudat.py +++ b/sources/eudat.py @@ -94,7 +94,7 @@ def search(search_term: str, results): _source.identifier = hit.get("id", "") # _source.url = hit.get('links', {}).get('self', '') # this gives json response _source.url = source_url_direct_access + _source.identifier - digitalObj.source.append(_source) + digitalObj.source = "EUDAT" if resource_type in ['DATASET', 'MODEL', 'AUDIOVISUAL']: results['resources'].append(digitalObj) diff --git a/sources/eulg.py b/sources/eulg.py index 7a397b2..edc4669 100644 --- a/sources/eulg.py +++ b/sources/eulg.py @@ -66,7 +66,11 @@ def _get(self, path: str, queries: List[set] = [], json: bool = False): dataset.name = result.resource_name dataset.url = url dataset.datePublished = str(result.creation_date) + dataset.dateModified = str(result.last_date_updated) dataset.description = description + dataset.version = result.version + dataset.encoding_contentUrl = result.detail + # dataset.conditionsOfAccess = result.condition_of_use[0] keywords = result.keywords if isinstance(keywords, list): for keyword in keywords: @@ -115,6 +119,9 @@ def _get(self, path: str, queries: List[set] = [], json: bool = False): software.name = result.resource_name software.url = url software.description = description + # software.version = result.version + software.encoding_contentUrl = result.detail + software.conditionsOfAccess = result.condition_of_use[0] software.datePublished = str(result.creation_date) software.countryOfOrigin = 
result.country_of_registration keywords = result.keywords diff --git a/sources/gesis.py b/sources/gesis.py index 22bb5e6..c585043 100644 --- a/sources/gesis.py +++ b/sources/gesis.py @@ -54,14 +54,12 @@ def search(search_term, results): resources.datePublished = date_published # publisher = dc_fields['publisher']['all'][0] if 'publisher' in dc_fields and 'all' in dc_fields['publisher'] else None # resources.publisher=publisher + + rights = dc_fields['rights']['all'][0] if 'rights' in dc_fields and 'all' in dc_fields['rights'] else None + resources.license = rights - # rights = dc_fields['rights']['all'][0] if 'rights' in dc_fields and 'all' in dc_fields['rights'] else None - # resources.license = rights - - languages = dc_fields['language']['all'] if 'language' in dc_fields and 'all' in dc_fields['language'] else '' - if languages: - for language in languages: - resources.inLanguage.append(language) + languages = dc_fields.get('language', {}).get('all', []) + resources.inLanguage.extend(languages) id = hit['_id'] id = id.replace('.', '-') diff --git a/sources/ieee.py b/sources/ieee.py index d0dc87f..2b5667a 100644 --- a/sources/ieee.py +++ b/sources/ieee.py @@ -65,7 +65,7 @@ def search(search_term, results): _source.name = source _source.identifier = hit.get("article_number", "") _source.url = hit.get("html_url", "") - publication.source.append(_source) + publication.source.append(_source) results['publications'].append(publication) diff --git a/sources/openaire.py b/sources/openaire.py new file mode 100644 index 0000000..1fd62c3 --- /dev/null +++ b/sources/openaire.py @@ -0,0 +1,181 @@ +import requests +import utils +from objects import Dataset, Author, Article, CreativeWork, Organization, Project +import logging + +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_string: str, results): + """ Obtain the results from an OpenAIRE request and handle them accordingly. + + Args: + search_string: keyword(s) to search for + results: search results formatted into different data types according to OpenAIRE result_types + and mapped to schema.org types. 
+ + Returns: + the results Object + """ + openaire_product_search(search_string, results) + openaire_project_search(search_string, results) + + logger.info(f"Got {len(results)} records from Openaire") + return results + + +def openaire_product_search(search_string, results): + + try: + api_url = 'https://api.openaire.eu/search/researchProducts' + response = requests.get(api_url, + params={"keywords": search_string, "format": "json", "size": 20}, + timeout=int(utils.config["request_timeout"])) + data = response.json() + logger.debug(f'Openaire product search response status code: {response.status_code}') + logger.debug(f'Openaire product search response headers: {response.headers}') + + # hits = data.get('response', {}).get('results', {}).get('result', []) + if response.status_code == 200: + try: + hits = data.get('response', {}).get('results', {}).get('result', []) + except AttributeError: + hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError + + for hit in hits: + pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:result', {}) + result_type = pro_result.get('resulttype', {}).get('@classid', 'other') + # check result type to create an Object of the right Class + if result_type == 'publication': + product = Article() + elif result_type == 'dataset': + product = Dataset() + else: + product = CreativeWork() + + product.source = 'Openaire' + collectedfrom = pro_result.get('collectedfrom', None) + if collectedfrom: + product.originalSource = collectedfrom.get('@name', None) + + product.genre = result_type + date = pro_result.get('dateofacceptance', None) + if date: + product.datePublished = date['$'] + + # title can be dict or list. If list, there are 'main title' and 'alternate title' + if type(pro_result.get('title')) is dict: + product.name = pro_result.get('title', {}).get('$', '') + elif type(pro_result.get('title')) is list: + for item in pro_result.get('title'): + if item['@classid'] == 'main title': + product.name = item['$'] + + # description can be dict or list + if type(pro_result.get('description')) is dict: + product.description = utils.remove_html_tags(pro_result.get('description', {}).get('$', '')) + elif type(pro_result.get('description')) is list: + product.description = utils.remove_html_tags(pro_result.get('description')[0].get('$', '')) + else: + product.description = '' + + # Language can be set or "und" = Undetermined + product.inLanguage = [] if pro_result.get('language', {}).get('@classid', '') == 'und' else [pro_result.get( + 'language', {}).get('@classid', '')] + + # pid can be dict or list + if type(pro_result.get('pid')) is dict: + product.identifier = pro_result.get('pid', {}).get('$', '') + elif type(pro_result.get('pid')) is list: + product.identifier = pro_result.get('pid', {})[0].get('$', '') + else: + product.identifier = '' + + # Creators can be dict, list, None + # creators = pro_result.get('creator', {}) if pro_result.get('creator') is not None else {} + creators = pro_result.get('creator', None) + if type(creators) is dict: + creator = Author() + creator.type = 'Person' + creator.name = creators.get('$', '') + product.author.append(creator) + elif type(creators) is list: + for item in creators: + creator = Author() + creator.type = 'Person' + creator.name = item.get('$', '') + product.author.append(creator) + + # Check genre to add result to right category + if product.genre == 'publication': + results['publications'].append(product) + elif product.genre == 'dataset' or product.genre == 'software': + 
results['resources'].append(product) + else: + results['others'].append(product) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENAIRE') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def openaire_project_search(search_string, results): + + try: + api_url = 'https://api.openaire.eu/search/projects' + response = requests.get(api_url, + params={"name": search_string, "format": "json", "size": 20}, + timeout=int(utils.config["request_timeout"])) + data = response.json() + logger.debug(f'Openaire project search response status code: {response.status_code}') + logger.debug(f'Openaire project search response headers: {response.headers}') + + if response.status_code == 200: + try: + hits = data.get('response', {}).get('results', {}).get('result', []) + except AttributeError: + hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError + + for hit in hits: + pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:project', {}) + project = Project() + project.source = 'Openaire' + project.name = pro_result.get('title', {}).get('$', '') + project.dateStart = pro_result.get('startdate', {}).get('$', '') + project.dateEnd = pro_result.get('enddate', {}).get('$', '') + project.identifier = pro_result.get('callidentifier', {}).get('$', '') + + # fundingtree can be dict or list + # fundingtree = pro_result.get('fundingtree', {}) if pro_result.get('fundingtree') is not None else {} + fundingtree = pro_result.get('fundingtree', None) + if type(fundingtree) is dict: + orga = Organization() + orga.name = fundingtree.get('name', {}).get('$', '') + project.funder.append(orga) + elif type(fundingtree) is list: + for item in fundingtree: + orga = Organization() + orga.name = item.get('name', {}).get('$', '') + project.funder.append(orga) + + # "rels" can be None, dict, list + relations = pro_result.get('rels', {}).get('rel', {}) if pro_result.get('rels', {}) is not None else [] + if type(relations) is dict: + relations = [relations] + + # This needs review: is 'Organization' the right type for these relation entries? 
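+ # Assumption: each "rel" entry appears to describe an organization related to the project, so its legalname is surfaced as an Author object of type 'Organization' until a dedicated participant field is available.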
+ for rel in relations: + author_obj = Author() + author_obj.type = 'Organization' + author_obj.name = (rel.get('legalname', {}).get('$', '')) + project.author.append(author_obj) + results['others'].append(project) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENAIRE') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') \ No newline at end of file diff --git a/sources/openalex.py b/sources/openalex.py new file mode 100644 index 0000000..e1aede0 --- /dev/null +++ b/sources/openalex.py @@ -0,0 +1,215 @@ +import requests +import logging +from objects import Person, Author, Article, Institute, Funder, Publisher +import utils + +# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_key: str, results): + find_works(search_key, results) + find_authors(search_key, results) + # find_institute(search_key, results) + # find_funder(search_key, results) + # find_publisher(search_key, results) + logger.info(f"Got {len(results)} author, publication, and institute records from OpenAlex") + return results + + +def find_authors(search_key, results): + + try: + base_url = utils.config["search_url_openalex_authors"] + headers = {'Accept': 'application/json', + 'Content-Type': 'application/json', + 'User-Agent': utils.config["request_header_user_agent"] + } + response = requests.get(base_url + search_key, headers=headers, timeout=int(utils.config["request_timeout"])) + + if response.status_code == 200: + search_result = response.json() + + records_found = search_result['meta']['count'] + logger.info(f'OpenAlex Authors - {records_found} records found') + + authors = search_result.get('results', None) + if authors: + for author in authors: + authorObj = Author() + authorObj.source = 'OpenAlex' + authorObj.name = author.get('display_name', '') + authorObj.orcid = author.get('orcid', '') + + last_known_institution = author.get('last_known_institution', {}) + if last_known_institution: + authorObj.affiliation = author.get('last_known_institution', {}).get('display_name') + else: + authorObj.affiliation = '' + authorObj.works_count = author.get('works_count', '') + authorObj.cited_by_count = author.get('cited_by_count', '') + + results['researchers'].append(authorObj) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENALEX') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def find_works(search_key, results): + + try: + api_url = "https://api.openalex.org/works?search=" + api_response = requests.get(api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code != 404: + api_data = api_response.json() + for work in api_data['results']: + if 'id' in work: + if work["display_name"] is None \ + or work["id"] is None \ + or work["doi"] is None \ + or work["publication_date"] is None: + continue + publication = Article() + publication.source = 'OpenAlex' + publication.name = utils.remove_html_tags(work["display_name"]) + publication.url = work["doi"] + # publication.image = hit_source.get("image", "") + publication.description = '' + if not work["abstract_inverted_index"] is None: + publication.description = generate_string_from_keys(work["abstract_inverted_index"]) # Generate the string using keys from the dictionary + publication.abstract = '' + keywords = 
work["concepts"] + if keywords: + for keyword in keywords: + publication.keywords.append(keyword["display_name"]) + + publication.inLanguage.append(str(work["language"])) + publication.datePublished = str(work["publication_date"]) + publication.license = '' + if not work["primary_location"]["license"] is None: + publication.license = work["primary_location"]["license"] + + if len(work["authorships"]) == 1: + author = Person() + author.name = work["authorships"][0]["author"]["display_name"] + author.type = 'Person' + author.identifier = work["id"] + publication.author.append(author) + else: + # authorship = ', '.join( + # current_author["author"]["display_name"] for current_author in work["authorships"]) + for current_author in work["authorships"]: + author = Person() + author.name = current_author["author"]["display_name"] + author.type = 'Person' + author.identifier = current_author["author"]["orcid"] + publication.author.append(author) + + publication.encoding_contentUrl = '' + publication.encodingFormat = '' + + results['publications'].append(publication) + '''' + results.append( + Article( + title=work["display_name"], + url=work["id"], + authors=author, + description='', + date=str(work["publication_year"]) + ) + ) + ''' + + # logger.info(f'Got {len(results)} publication records from OpenAlex') + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('OPENALEX') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def find_institute(search_key, results): + institute_api_url = "https://api.openalex.org/institutions?search=" + api_response = requests.get(institute_api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code != 404: + api_data = api_response.json() + for institute in api_data["results"]: + if 'id' in institute: + institute_acronym = ', '.join( + inst_acronym for inst_acronym in institute["display_name_acronyms"]) + + description = '' + if 'wikipedia' in institute["ids"]: + # institute_wikipedia_link = institute["ids"]["wikipedia"] + description = utils.read_wikipedia(institute["display_name"]) + + institute_country = '' + if 'country' in institute["geo"]: + institute_country = institute["geo"]["country"] + results.append( + Institute( + id=institute["id"], + name=institute["display_name"], + country=institute_country, + institute_type=institute["type"], + acronyms_name=institute_acronym, + homepage_url=institute["homepage_url"], + description=description) + ) + # logger.info(f'Got {len(results)} institute records from OpenAlex') + + +def find_funder(search_key, results): + funder_api_url = "https://api.openalex.org/funders?search=" + api_response = requests.get(funder_api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code == 404: + return + api_data = api_response.json() + for funder in api_data["results"]: + if 'id' in funder: + results.append( + Funder( + id=funder["id"], + name=funder["display_name"], + homepage_url=funder["homepage_url"], + country_code=funder["country_code"], + grants_count=funder["grants_count"], + works_count=funder["works_count"], + description=funder["description"]) + ) + + +def find_publisher(search_key, results): + publisher_api_url = "https://api.openalex.org/publishers?search=" + api_response = requests.get(publisher_api_url + search_key, timeout=int(utils.config["request_timeout"])) + if api_response.status_code == 404: + return + api_data = api_response.json() + for 
publisher in api_data["results"]: country_codes = ', '.join( country_code for country_code in publisher["country_codes"]) h_index = '' if 'h_index' in publisher["summary_stats"]: h_index = publisher["summary_stats"]["h_index"] if 'id' in publisher: results.append( Publisher( id=publisher["id"], name=publisher["display_name"], country_codes=country_codes, works_count=publisher["works_count"], homepage_url=publisher['homepage_url'], h_index=h_index, description='') ) def generate_string_from_keys(dictionary): keys_list = list(dictionary.keys()) keys_string = " ".join(keys_list) return keys_string diff --git a/sources/openalex_researchers.py b/sources/openalex_researchers.py index 783a550..6682684 100644 --- a/sources/openalex_researchers.py +++ b/sources/openalex_researchers.py @@ -1,10 +1,14 @@ import requests from objects import thing, Article, Author, Organization import logging import utils from sources import data_retriever import traceback from openai import OpenAI +import json # logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) logger = logging.getLogger('nfdi_search_engine') @@ -16,22 +20,22 @@ def generate_string_from_keys(dictionary): @utils.timeit def search(search_term: str, results): - + source = "OPENALEX Researchers" try: - search_result = data_retriever.retrieve_data(source=source, + search_result = data_retriever.retrieve_data(source=source, base_url=utils.config["search_url_openalex_researchers"], search_term=search_term, results=results) total_records_found = search_result['meta']['count'] hits = search_result.get("results", []) total_hits = len(hits) - logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') - if int(total_hits) > 0: + if int(total_hits) > 0: for hit in hits: - + author = Author() # info = hit.get('info',{}) author.orcid = hit.get("ids", {}).get("orcid", "") @@ -69,8 +73,8 @@ def search(search_term: str, results): if isinstance(topics, list): for topic in topics: name = topic.get('display_name', '') - author.researchAreas.append(name) - + author.researchAreas.append(name) + author.works_count = hit.get('works_count', '') author.cited_by_count = hit.get('cited_by_count', '') @@ -79,7 +83,7 @@ def search(search_term: str, results): _source.identifier = hit.get("ids", {}).get("openalex", "").replace('https://openalex.org/','') author.source.append(_source) - search_result_semantic = data_retriever.retrieve_data(source=source, + search_result_semantic = data_retriever.retrieve_data(source=source, base_url="https://api.semanticscholar.org/graph/v1/author/search?fields=name,url,externalIds,paperCount,citationCount&query=", search_term= author.name.replace(" ", "+"), results={}) @@ -92,17 +96,17 @@ def search(search_term: str, results): _source = thing() _source.name = 'SEMANITCSCHOLAR' _source.identifier = semanticId - _source.url = semantic_hit.get("url", "") + _source.url = semantic_hit.get("url", "") author.source.append(_source) break results['researchers'].append(author) - + except requests.exceptions.Timeout as ex: logger.error(f'Timed out Exception: {str(ex)}') results['timedout_sources'].append(source) - + except Exception as ex: logger.error(f'Exception: {str(ex)}') logger.error(traceback.format_exc()) @@ -122,11 +126,11 @@ def get_researcher_details(url): url = 
json.loads(url) try: - hit = data_retriever.retrieve_data(source=source, + hit = data_retriever.retrieve_data(source=source, base_url="https://api.openalex.org/authors/", search_term=url[0]['sid'], results={}) - + researcher = Author() researcher.url = json.dumps(url) researcher.orcid = hit.get("ids", {}).get("orcid", "") @@ -170,26 +174,26 @@ def get_researcher_details(url): _source.name = 'OPENALEX' _source.identifier = hit.get("ids", {}).get("openalex", "").replace('https://openalex.org/','') researcher.source.append(_source) - + ##### uncomment to search openalex for publications... - # search_result = data_retriever.retrieve_data(source=source, + # search_result = data_retriever.retrieve_data(source=source, # base_url="https://api.openalex.org/works?filter=author.id:", # search_term=researcher.source[0].identifier, # results={}) # total_records_found = search_result['meta']['count'] # hits = search_result.get("results", []) # total_hits = len(hits) - # logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') - # if int(total_hits) > 0: + # logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + # if int(total_hits) > 0: # for hit in hits: - - # publication = Article() - # publication.name = utils.remove_html_tags(hit.get("title", "")) + # publication = Article() + + # publication.name = utils.remove_html_tags(hit.get("title", "")) # publication.url = hit.get("id", "") # not a valid url, openalex is currently working on their web interface. # publication.identifier = hit.get("doi", "").replace("https://doi.org/", "") - # publication.datePublished = hit.get("publication_date", "") + # publication.datePublished = hit.get("publication_date", "") # publication.inLanguage.append(hit.get("language", "")) # publication.license = hit.get("primary_location", {}).get("license", "") # # publication.publication = hit.get("primary_location", {}).get("source", {}).get("display_name", "") @@ -198,7 +202,7 @@ def get_researcher_details(url): # publication.description = generate_string_from_keys(abstract_inverted_index) # Generate the string using keys from the dictionary # publication.abstract = publication.description - # authorships = hit.get("authorships", []) + # authorships = hit.get("authorships", []) # for authorship in authorships: # authors = authorship.get("author", {}) @@ -206,21 +210,21 @@ def get_researcher_details(url): # _author = Author() # _author.type = 'Person' # _author.name = authors.get("display_name", "") - # _author.identifier = authors.get("orcid", "") + # _author.identifier = authors.get("orcid", "") # publication.author.append(_author) # # getattr(publication, "source").clear() # _source = thing() # _source.name = 'OPENALEX' # _source.identifier = hit.get("id", "").replace("https://openalex.org/", "") # remove the base url and only keep the ID - # _source.url = hit.get("id", "") # not a valid url, openalex is currently working on thier web interface. + # _source.url = hit.get("id", "") # not a valid url, openalex is currently working on thier web interface. # publication.source.append(_source) # researcher.works.append(publication) # search semantic scholar... 
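+ # Two-step lookup (sketch of the intent, based on the calls below): first resolve the researcher via the Semantic Scholar author-search endpoint, then fetch that author's papers from the /author/{semanticId}/papers endpoint.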
-        search_result = data_retriever.retrieve_data(source=source,
+        search_result = data_retriever.retrieve_data(source=source,
                                             base_url="https://api.semanticscholar.org/graph/v1/author/search?fields=name,url,externalIds,paperCount,citationCount&query=",
                                             search_term= researcher.name.replace(" ", "+"),
                                             results={})
@@ -233,57 +237,110 @@ def get_researcher_details(url):
                 _source = thing()
-                _source.name = 'SEMANITCSCHOLAR'
+                _source.name = 'SEMANTICSCHOLAR'
                 _source.identifier = semanticId
-                _source.url = hit.get("url", "")
+                _source.url = hit.get("url", "")
                 researcher.source.append(_source)
                 break
 
-        search_result = data_retriever.retrieve_data(source=source,
+        search_result = data_retriever.retrieve_data(source=source,
                                             base_url=f'https://api.semanticscholar.org/graph/v1/author/{semanticId}/papers?fields=url,title,venue,year,authors,abstract',
                                             search_term= "",
                                             results={})
-
+
         hits = search_result.get("data", [])
         a = 0
         total_hits = len(hits)
-        if int(total_hits) > 0:
+        if int(total_hits) > 0:
             for hit in hits:
-
-                publication = Article()
-                publication.name = utils.remove_html_tags(hit.get("title", ""))
+                publication = Article()
+
+                publication.name = utils.remove_html_tags(hit.get("title", ""))
                 publication.url = hit.get("url", "")
                 publication.identifier = hit.get("title", "")
                 publication.description = hit.get("abstract", "")
                 # publication.identifier = hit.get("doi", "").replace("https://doi.org/", "")
-                publication.datePublished = hit.get("year", "")
+                publication.datePublished = hit.get("year", "")
                 # publication.inLanguage.append(hit.get("language", ""))
                 # publication.license = hit.get("primary_location", {}).get("license", "")
                 # publication.publication = hit.get("primary_location", {}).get("source", {}).get("display_name", "")
                 # abstract_inverted_index = hit.get("abstract_inverted_index", {})
                 # publication.description = generate_string_from_keys(abstract_inverted_index) # Generate the string using keys from the dictionary
                 # publication.abstract = publication.description
 
-                authorships = hit.get("authors", [])
+                authorships = hit.get("authors", [])
 
                 for authorship in authorships:
                     # authors = authorship.get("author", {})
                     _author = Author()
                     _author.type = 'Person'
                     _author.name = authorship.get("name", "")
-                    # _author.identifier = authors.get("orcid", "")
+                    # _author.identifier = authors.get("orcid", "")
                     publication.author.append(_author)
 
                 # getattr(publication, "source").clear()
                 _source = thing()
                 _source.name = 'SEMANTICSCHOLAR'
                 # _source.identifier = hit.get("id", "").replace("https://openalex.org/", "") # remove the base url and only keep the ID
-                # _source.url = hit.get("id", "") # not a valid url, openalex is currently working on their web interface.
+                # _source.url = hit.get("id", "") # not a valid url, openalex is currently working on their web interface.
publication.source.append(_source)
+                    researcher.works.append(publication)
+                    a+=1
+            ### uncomment to generate about section
+            logger.info(f'Getting publications {a}')
+            details = vars(researcher)
+            # Convert the details into a string format
+            details_str = "\n".join(f"{key}: {convert_to_string(value)}" for key, value in details.items() if (value not in ("", [], {}, None) and key not in ("works", "source","orcid")))
+            prompt = f"Generate a 2-3 line 'About' section for a researcher based on the following details:\n{details_str}"
+            client = OpenAI(
+                api_key=utils.env_config["OPENAI_API_KEY"],
+            )
+            logger.info('sent message to openai')
+            chat_completion = client.chat.completions.create(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": f'{prompt}',
+                    }
+                ],
+                model="gpt-3.5-turbo",
+            )
+            # Chat Completions return the generated text under choices[0].message.content
+            researcher.about = chat_completion.choices[0].message.content.strip()
 
     except Exception as ex:
         logger.error(f'Exception: {str(ex)}')
         logger.error(traceback.format_exc())
 
     return researcher
 
+def get_researcher_banner(researcher: Author):
+    try:
+        details = vars(researcher)
+        details_str = "\n".join(f"{convert_to_string(value)}" for key, value in details.items() if (value not in ("", [], {}, None) and key in ("researchAreas")))
+        prompt = f"A banner for researcher with following research areas:\n{researcher.about}"
+        client = OpenAI(
+            api_key=utils.env_config["OPENAI_API_KEY"],
+        )
+        response = client.images.generate(
+            model="dall-e-2",
+            prompt=prompt,
+            size="512x512",
+            quality="standard",
+            response_format="b64_json",
+            n=1,
+        )
+        researcher.banner = response.data[0].b64_json
+
@@ -334,4 +391,5 @@ def get_researcher_banner(researcher: Author):
         logger.error(f'Exception: {str(ex)}')
         logger.error(traceback.format_exc())
 
+    return researcher
\ No newline at end of file
diff --git a/sources/orcid.py b/sources/orcid.py
index 9208d89..f4f5ca2 100644
--- a/sources/orcid.py
+++ b/sources/orcid.py
@@ -1,6 +1,6 @@
 import requests
 import logging
-from objects import Person, Author, thing, Organization
+from objects import Person, Author, Organization
 import utils
 
 logger = logging.getLogger('nfdi_search_engine')
@@ -9,10 +9,10 @@ def search(search_term: str, results):
 
     try:
-
+
         base_url = utils.config["search_url_orcid"]
         url = base_url + '"' + search_term.replace(' ', '+') + '"'
-
+
         headers = {'Accept': 'application/json',
                    'Content-Type': 'application/json',
                    'User-Agent': utils.config["request_header_user_agent"]
@@ -41,7 +41,7 @@ def search(search_term: str, results):
 
             institution = author.get('institution-name', [])
             for inst in institution:
-                authorObj.affiliation.append(Organization(name=inst))
+                authorObj.affiliation.append(Organization(name=inst))
 
             authorObj.works_count = ''
             authorObj.cited_by_count = ''
@@ -50,7 +50,7 @@
     except requests.exceptions.Timeout as ex:
         logger.error(f'Timed out Exception: {str(ex)}')
         results['timedout_sources'].append('ORCID')
-
+
     except Exception as ex:
         logger.error(f'Exception: {str(ex)}')
@@ -79,11 +79,11 @@ def get_orcid_access_token():
     else:
         print("Failed to obtain access token:", response.text)
         return None
-
+
 # Function to search for public information from ORCID
 def old_search(search_term, results):
     # It can also be used for retrieving further information, logging in, editing records, etc.
-    access_token = '45d5a287-de76-4a62-8ab9-1ffc046e7cde'
+    access_token = '45d5a287-de76-4a62-8ab9-1ffc046e7cde'
     headers = {
         'Accept': 'application/json',
         'Content-Type': 'application/json',
@@ -105,7 +105,7 @@ def old_search(search_term, results):
     # Check if the response contains any search results
     if 'result' in json_data and isinstance(json_data['result'], list) and json_data['result']:
         # Iterate through the first 10 results for now (can be changed)
-        for result in json_data['result'][:10]:
+        for result in json_data['result'][:10]:
             orcid_id = result['orcid-identifier']['path']
 
             # Generate the URL to the person's public profile in ORCID
@@ -118,7 +118,7 @@ def old_search(search_term, results):
             if response.status_code == 200:
                 # Extract the JSON response
                 json_data = response.json()
-
+
                 # Extract the name information
                 name_data = json_data.get('name', {})
                 given_names = name_data.get('given-names', {}).get('value', '')
@@ -152,7 +152,7 @@ def old_search(search_term, results):
                     external_identifier_type = external_identifier.get('external-id-type', '')
                     external_identifier_value = external_identifier.get('external-id-value', '')
                     external_identifier_values.append((external_identifier_type, external_identifier_value))
-
+
                 affiliations = json_data.get('employments', {}).get('employment-summary', [])
                 if affiliations:
                     for affiliation in affiliations:
@@ -192,7 +192,7 @@ def old_search(search_term, results):
             else:
                 print("Failed to search for public data:", response.text)
 
-        logger.info(f'Got {len(results)} records from Orcid')
-
+        logger.info(f'Got {len(results)} records from Orcid')
+
     except requests.exceptions.RequestException as e:
         print("An error occurred during the request:", str(e))
\ No newline at end of file
diff --git a/sources/wikidata.py b/sources/wikidata.py
new file mode 100644
index 0000000..54489aa
--- /dev/null
+++ b/sources/wikidata.py
@@ -0,0 +1,171 @@
+import requests
+import logging
+from objects import Article, Author
+from string import Template
+from datetime import datetime
+from dateutil import parser
+import utils
+
+logger = logging.getLogger('nfdi_search_engine')
+
+@utils.timeit
+def search(search_string: str, results):
+    """ Obtain the results from a Wikidata request and handle them accordingly.
+
+    Args:
+        search_string: keyword(s) to search for
+        results: search results are formatted according to schema.org types Article, Author, ...
+
+    Returns:
+        the results array
+    """
+    wikidata_person_search(search_string, results)
+    wikidata_article_search(search_string, results)
+
+    logger.info(f"Got {len(results['researchers'])} author and {len(results['publications'])} publication records from Wikidata")
+    return results
+
+
+def wikidata_article_search(search_string: str, results):
+    try:
+
+        url = 'https://query.wikidata.org/sparql'
+        headers = {'User-Agent': 'https://nfdi-search.nliwod.org/'}
+        query_template = Template('''
+        SELECT DISTINCT ?item ?label ?date #(year(?date)as ?dateYear)
+        (group_concat(DISTINCT ?authorsName; separator=",") as ?authorsLabel)
+        (group_concat(DISTINCT ?authors2; separator=",") as ?authorsString)
+        WHERE
+        {
+          SERVICE wikibase:mwapi
+          {
+            bd:serviceParam wikibase:endpoint "www.wikidata.org";
+                            wikibase:limit "once";
+                            wikibase:api "Generator";
+                            mwapi:generator "search";
+                            mwapi:gsrsearch "$search_string";
+                            mwapi:gsrlimit "150".
+            ?item wikibase:apiOutputItem mwapi:title.
+          }
+          ?item rdfs:label ?label. FILTER( LANG(?label)="en" )
+          ?item wdt:P31/wdt:P279* wd:Q11826511.
+          ?item wdt:P577 ?date .
+          ?item wdt:P50 ?authors.
+          ?authors rdfs:label ?authorsName .
FILTER( LANG(?authorsName)="en" ) + optional {?item wdt:P2093 ?authors2.} + } + GROUP BY ?item ?label ?date + #ORDER BY DESC(?dateYear) + ''') + + response = requests.get(url, + params={'format': 'json', 'query': query_template.substitute(search_string=search_string), + }, headers=headers, timeout=int(utils.config["request_timeout"])) + logger.debug(f'Wikidata article search response status code: {response.status_code}') + logger.debug(f'Wikidata article search response headers: {response.headers}') + + if response.status_code == 200: + data = response.json() + if data["results"]["bindings"]: + for result in data["results"]["bindings"]: + publication = Article() + publication.source = 'Wikidata' + publication.url = result['item'].get('value', "") + publication.name = result['label'].get('value', "") + date_obj = parser.parse(result.get('date', {}).get('value', "")) + date = datetime.strftime(date_obj, '%Y-%m-%d') + publication.datePublished = date # result.get('date', {}).get('value', "") + if result['authorsLabel'].get("value"): + authors_list = result['authorsLabel'].get("value", "").rstrip(",").split(",") + for item in authors_list: + author = Author() + author.name = item + author.type = 'Person' + publication.author.append(author) + if result['authorsString'].get("value"): + authors_list = result['authorsString'].get("value", "").rstrip(",").split(",") + for item in authors_list: + author = Author() + author.name = item + author.type = 'Person' + publication.author.append(author) + results['publications'].append(publication) + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('WIKIDATA') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') + +def wikidata_person_search(search_string: str, results): + try: + url = 'https://query.wikidata.org/sparql' + headers = {'User-Agent': 'https://nfdi-search.nliwod.org/'} + query_template = Template(''' +SELECT DISTINCT ?item ?itemLabel ?orcid (SAMPLE(?employerLabel) as ?employerSampleLabel) ?nationalityLabel ?givenNameLabel ?familyNameLabel + WHERE + { + SERVICE wikibase:mwapi + { + bd:serviceParam wikibase:endpoint "www.wikidata.org"; + wikibase:api "EntitySearch"; + + mwapi:search "$search_string"; + mwapi:language "en"; + mwapi:limit "150". + ?item wikibase:apiOutputItem mwapi:item. + } + #?item (wdt:P279*/wdt:P31) wd:Q482980 . + ?item wdt:P106 ?occ . + ?occ wdt:P279* wd:Q1650915 . + OPTIONAL {?item wdt:P496 ?orcid .} + OPTIONAL {?item wdt:P27 ?nationality.} + OPTIONAL {?item wdt:P735 ?givenName.} + OPTIONAL {?item wdt:P734 ?familyName.} + OPTIONAL { + ?item p:P108 ?st. + ?st ps:P108 ?employer. + ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) + ?st pq:P580 ?date. + MINUS {?st pq:P582 ?enddate.} + } + OPTIONAL {?item wdt:P108 ?employer. + ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) + } + + SERVICE wikibase:label { + bd:serviceParam wikibase:language "en" . 
+ } + } +GROUP by ?item ?itemLabel ?orcid ?nationalityLabel ?givenNameLabel ?familyNameLabel + + ''') + + response = requests.get(url, + params={'format': 'json', 'query': query_template.substitute(search_string=search_string), + }, headers=headers, timeout=int(utils.config["request_timeout"])) + logger.debug(f'Wikidata person search response status code: {response.status_code}') + logger.debug(f'Wikidata person search response headers: {response.headers}') + + if response.status_code == 200: + data = response.json() + if data["results"]["bindings"]: + for result in data["results"]["bindings"]: + author = Author() + author.source = 'Wikidata' + author.url = result['item'].get('value', "") + author.name = result['itemLabel'].get('value', "") + author.givenName = result.get('givenNameLabel', {}).get('value', "") + author.familyName = result.get('familyNameLabel', {}).get('value', "") + author.affiliation = result.get('employerSampleLabel', {}).get('value', "") + author.nationality = result.get('nationalityLabel', {}).get('value', "") + author.orcid = result.get('orcid', {}).get('value', "") + + results['researchers'].append(author) + + except requests.exceptions.Timeout as ex: + logger.error(f'Timed out Exception: {str(ex)}') + results['timedout_sources'].append('WIKIDATA') + + except Exception as ex: + logger.error(f'Exception: {str(ex)}') diff --git a/sources/wikidata_researchers.py b/sources/wikidata_researchers.py index b83f90a..da1c0bc 100644 --- a/sources/wikidata_researchers.py +++ b/sources/wikidata_researchers.py @@ -13,7 +13,7 @@ @utils.timeit def search(search_term: str, results): - + source = "WIKIDATA Researchers" try: @@ -25,74 +25,89 @@ def search(search_term: str, results): SERVICE wikibase:mwapi { bd:serviceParam wikibase:endpoint "www.wikidata.org"; - wikibase:api "EntitySearch"; + wikibase:api "EntitySearch"; mwapi:search "$search_string"; mwapi:language "en"; mwapi:limit "150". ?item wikibase:apiOutputItem mwapi:item. } - + ?item wdt:P106 ?occ . ?occ wdt:P279* wd:Q1650915 . OPTIONAL {?item wdt:P496 ?orcid .} OPTIONAL {?item wdt:P27 ?nationality.} OPTIONAL {?item wdt:P735 ?givenName.} - OPTIONAL {?item wdt:P734 ?familyName.} + OPTIONAL {?item wdt:P734 ?familyName.} OPTIONAL { ?item p:P108 ?st. ?st ps:P108 ?employer. ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) ?st pq:P580 ?date. - MINUS {?st pq:P582 ?enddate.} + MINUS {?st pq:P582 ?enddate.} } - OPTIONAL {?item wdt:P108 ?employer. + OPTIONAL {?item wdt:P108 ?employer. ?employer rdfs:label ?employerLabel. FILTER( LANG(?employerLabel)="en" ) - } + } SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . 
} } - GROUP by ?item ?itemLabel ?orcid ?nationalityLabel ?givenNameLabel ?familyNameLabel - - ''') + GROUP by ?item ?itemLabel ?orcid ?nationalityLabel ?givenNameLabel ?familyNameLabel + + ''') query = query_template.substitute(search_string=search_term) - query = ' '.join(query.split()) + query = ' '.join(query.split()) - search_result = data_retriever.retrieve_data(source=source, + search_result = data_retriever.retrieve_data(source=source, base_url=utils.config["search_url_wikidata"], search_term=query, results=results) - - hits = search_result.get("results", {}).get("bindings", []) + + hits = search_result.get("results", {}).get("bindings", []) total_hits = len(hits) - logger.info(f'{source} - {total_hits} hits found') + logger.info(f'{source} - {total_hits} hits found') - if int(total_hits) > 0: + if int(total_hits) > 0: for hit in hits: - - author = Author() - # info = hit.get('info',{}) - author.orcid = hit.get("orcid", {}).get("value", "") - author.name = hit.get('itemLabel', '').get('value', '') - affiliations = hit.get('employerSampleLabel', {}) - if isinstance(affiliations, dict): - author.affiliation.append(Organization(name = affiliations.get('value', {}))) - author.works_count = '' - author.cited_by_count = '' - - _source = thing() - _source.name = 'WIKIDATA' - _source.identifier = hit.get("ids", {}).get("openalex", "") - _source.url = hit.get("item", {}).get("value", "") - author.source.append(_source) - - results['researchers'].append(author) - + + # this block should be updated to researchers + + publication = Article() + + publication.name = hit.get("label", {}).get("value","") + publication.url = hit.get("item", {}).get("value","") + publication.identifier = "" #DOI is available for few; we need to update the sparql query to fetch this information + publication.datePublished = datetime.strftime(parser.parse(hit.get('date', {}).get('value', "")), '%Y-%m-%d') + + authorsLabels = hit.get("authorsLabel", {}).get("value","") + for authorsLabel in authorsLabels.rstrip(",").split(","): + _author = Author() + _author.type = 'Person' + _author.name = authorsLabel + _author.identifier = "" #ORCID is available for few; we need to update the sparql query to pull this information + publication.author.append(_author) + + authorsStrings = hit.get("authorsString", {}).get("value","") + for authorsString in authorsStrings.rstrip(",").split(","): + _author = Author() + _author.type = 'Person' + _author.name = authorsString + _author.identifier = "" + publication.author.append(_author) + + _source = thing() + _source.name = 'WIKIDATA' + _source.identifier = hit['item'].get('value', "").replace("http://www.wikidata.org/", "") # remove the base url and only keep the ID + _source.url = hit['item'].get('value', "") + publication.source.append(_source) + + results['publications'].append(publication) + except requests.exceptions.Timeout as ex: logger.error(f'Timed out Exception: {str(ex)}') results['timedout_sources'].append(source) - + except Exception as ex: logger.error(f'Exception: {str(ex)}') logger.error(traceback.format_exc()) \ No newline at end of file diff --git a/sources/zenodo.py b/sources/zenodo.py index 591f3e4..f8f9310 100644 --- a/sources/zenodo.py +++ b/sources/zenodo.py @@ -1,7 +1,7 @@ import requests import utils # from objects import Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Person, LearningResource, CreativeWork, VideoObject, ImageObject -from objects import thing, Article, Author, CreativeWork, Dataset, SoftwareApplication, VideoObject, 
ImageObject, LearningResource +from objects import thing, Article, Statistics, Author, CreativeWork, Dataset, SoftwareApplication, VideoObject, ImageObject, LearningResource import logging from sources import data_retriever import traceback @@ -14,94 +14,233 @@ def search(search_term, results): source = "Zenodo" try: - search_result = data_retriever.retrieve_data(source=source, + search_result = data_retriever.retrieve_data(source=source, base_url=utils.config["search_url_zenodo"], search_term=search_term, - results=results) + results=results) total_records_found = search_result.get("hits", {}).get("total", 0) hits = search_result.get("hits", {}).get("hits", []) total_hits = len(hits) - logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') if int(total_hits) > 0: for hit in hits: - + metadata = hit.get('metadata', {}) resource_type = metadata.get('resource_type', {}).get('type','OTHER').upper() if resource_type == 'PUBLICATION': - digitalObj = Article() + digitalObj = Article() elif resource_type in ['PRESENTATION', 'POSTER']: - digitalObj = CreativeWork() + digitalObj = CreativeWork() elif resource_type == 'DATASET': - digitalObj = Dataset() + digitalObj = Dataset() elif resource_type == 'VIDEO': - digitalObj = VideoObject() + digitalObj = VideoObject() elif resource_type == 'IMAGE': - digitalObj = ImageObject() + digitalObj = ImageObject() elif resource_type == 'LESSON': - digitalObj = LearningResource() + digitalObj = LearningResource() elif resource_type == 'SOFTWARE': - digitalObj = SoftwareApplication() + digitalObj = SoftwareApplication() elif resource_type == 'OTHER': - digitalObj = CreativeWork() + digitalObj = CreativeWork() else: print('This resource type is still not defined:', resource_type) digitalObj = CreativeWork() - + digitalObj.identifier = hit.get('doi', '') digitalObj.name = hit.get('title', '') - digitalObj.url = hit.get('links', {}).get('self', '') - + digitalObj.url = hit.get('links', {}).get('self', '') + digitalObj.genre = resource_type digitalObj.description = utils.remove_html_tags(metadata.get('description', '')) - + keywords = metadata.get('keywords', []) if isinstance(keywords, list): for keyword in keywords: - digitalObj.keywords.append(keyword) - + terms = [term.strip() for term in keyword.split(",")] + digitalObj.keywords.extend(terms) + language = metadata.get('language', '') digitalObj.inLanguage.append(language) + digitalObj.dateCreated = hit.get('created','') + digitalObj.dateModified = hit.get('modified','') + digitalObj.datePublished = metadata.get('resource_date', '') + digitalObj.license = metadata.get('license', {}).get('id', '') + digitalObj.creativeWorkStatus = hit.get('status','') + digitalObj.funder = metadata.get('grants', [{}])[0].get('funder', {}).get('name', '') + digitalObj.conditionsOfAccess = metadata.get('access-rights','') + if(digitalObj.conditionsOfAccess == ''): + digitalObj.conditionsOfAccess = metadata.get('access_right','') + + relation_map = { + 'iscitedby': 'isCitedBy', + 'issupplementto': 'isSupplementTo', + 'ispartof': 'isPartOf', + 'cites': 'cites', + 'issourceof': 'isSourceOf', + 'isderivedfrom': 'isDerivedFrom', + 'issupplementedby': 'isSupplementedBy', + 'ispreviousversionof': 'isPreviousVersionOf', + 'documents': 'documents', + 'haspart': 'hasPart' + } + + related_identifiers = metadata.get('related_identifiers', []) + + for related_identifier in related_identifiers: + relation = 
related_identifier.get('relation', '').lower()
+                identifier = related_identifier.get('identifier', '')
 
-            digitalObj.datePublished = metadata.get('publication_date', '')
-            digitalObj.license = metadata.get('license', {}).get('id', '')
-
-
+                if relation in relation_map:
+                    getattr(digitalObj, relation_map[relation]).append(identifier)
 
-            authors = metadata.get("creators", [])
+            authors = metadata.get("creators", [])
             for author in authors:
                 _author = Author()
                 _author.type = 'Person'
                 _author.name = author.get("name", "")
                 _author.identifier = author.get("orcid", "")
                 _author.affiliation = author.get("affiliation", "")
-                digitalObj.author.append(_author)
+                digitalObj.author.append(_author)
+
+            Stats = hit.get('stats', '')
+            _stats = Statistics()
+
+            _stats.downloads = Stats.get("downloads", '')
+            _stats.unique_downloads = Stats.get("unique_downloads", '')
+            _stats.views = Stats.get("views", '')
+            _stats.unique_views = Stats.get("unique_views", '')
+            _stats.version_downloads = Stats.get("version_downloads", '')
+            _stats.version_unique_downloads = Stats.get("version_unique_downloads", '')
+            _stats.version_unique_views = Stats.get("version_unique_views", '')
+            _stats.version_views = Stats.get("version_views", '')
+
+            digitalObj.stats = _stats
+
+            contributors = metadata.get("contributors", [])
+            for contributor in contributors:
+                _contributor = Author()
+                _contributor.type = 'Person'
+                _contributor.name = contributor.get("name", "")
+                _contributor.identifier = contributor.get("orcid", "")
+                _contributor.affiliation = contributor.get("affiliation", "")
+                digitalObj.contributor.append(_contributor)
 
             _source = thing()
             _source.name = source
             _source.identifier = hit.get("id", "")
-            _source.url = hit.get('links', {}).get('self_html', '')
-            digitalObj.source.append(_source)
+            _source.url = hit.get('links', {}).get('self_html', '')
+            digitalObj.source.append(_source)
 
+            files = hit.get('files', [])
+
+            # if resource_type == "LESSON":
+            for file in files:
+                file_key = file.get("key", "")
+                digitalObj.encoding_contentUrl[file_key] = file.get("links", {}).get("self", "")
+
+            digitalObj.softwareVersion = metadata.get("version", "")
             if resource_type.upper() == 'PUBLICATION':
                 digitalObj.abstract = digitalObj.description
+                pages = hit.get("journal", {}).get('pages', '')
+                if '-' in pages:
+                    a, b = pages.split('-')
+                    digitalObj.pageStart = a.strip()
+                    digitalObj.pageEnd = b.strip()
+                else:
+                    digitalObj.pageStart = pages
+                    digitalObj.pageEnd = ''
 
-                files = hit.get('files', [])
-                for file in files:
-                    if file.get("key", "").endswith(".pdf"):
-                        digitalObj.encoding_contentUrl = file.get("links", {}).get("self", "")
+                digitalObj.pagination = pages
 
-                results['publications'].append(digitalObj)
-            elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']:
-                results['resources'].append(digitalObj)
+                journal_info = metadata.get('journal', {})
+                digitalObj.Journal = journal_info.get('title', '')
+                digitalObj.JournalVolume = journal_info.get('volume', '')
+                digitalObj.issue = journal_info.get('issue', '')
+
+                results['publications'].append(digitalObj)
+            elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']:
+                results['resources'].append(digitalObj)
             else:
-                results['others'].append(digitalObj)
+                results['others'].append(digitalObj)
 
     except requests.exceptions.Timeout as ex:
         logger.error(f'Timed out Exception: {str(ex)}')
         results['timedout_sources'].append(source)
-
+
+    except Exception as ex:
+        logger.error(f'Exception: {str(ex)}')
+        logger.error(traceback.format_exc())
+
+
+@utils.timeit
+def get_resource(doi: str):
+
+    source = "Zenodo"
+    start_index = doi.find("zenodo.")
+    if start_index != -1:
+        doi = doi[start_index + len("zenodo."):]
+
+    try:
+        search_results = data_retriever.retrieve_single_object(source=source,
+                                                        base_url=utils.config["search_url_zenodo"],
+                                                        doi = doi)
+        search_result = search_results.get("hits", {}).get("hits", [])
+        search_result = search_result[0]
+        metadata = search_result.get('metadata', {})
+        resource = CreativeWork()
+        resource.name = search_result.get("title", "")
+        resource.url = search_result.get('links', {}).get('self', '')
+        resource.identifier = search_result.get("doi", "")
+        resource.datePublished = metadata.get("publication_date", "")
+        resource.inLanguage.append(metadata.get("language", ""))
+        resource.license = metadata.get("license", "")
+        files = search_result.get('files', [])
+        resource.encoding_contentUrl = {file["key"]: file["links"]["self"] for file in files}
+        resource.description = utils.remove_html_tags(metadata.get("description", ""))
+        resource.abstract = resource.description
+        authors = metadata.get("creators", [])
+        for author in authors:
+            _author = Author()
+            _author.type = 'Person'
+            _author.name = author.get("name", "")
+            _author.identifier = author.get("orcid", "")
+            _author.affiliation = author.get("affiliation", "")
+            resource.author.append(_author)
+
+        keywords = metadata.get('keywords', [])
+        if isinstance(keywords, list):
+            for keyword in keywords:
+                terms = [term.strip() for term in keyword.split(",")]
+                resource.keywords.extend(terms)
+
+        return resource
+
+    except requests.exceptions.Timeout as ex:
+        logger.error(f'Timed out Exception: {str(ex)}')
+
     except Exception as ex:
         logger.error(f'Exception: {str(ex)}')
         logger.error(traceback.format_exc())
\ No newline at end of file
diff --git a/templates/components/resources.html b/templates/components/resources.html
index d3142dc..9523f75 100644
--- a/templates/components/resources.html
+++ b/templates/components/resources.html
@@ -13,20 +13,28 @@
-
- - {{resources.name}} +
+ + {{ resources.name }} + + DOI:{{ resources.identifier }}
+ {% set author_count = namespace(value=0) %}
 {% for author in resources.author %}
+ {% if author_count.value == 5 %}
+ and {{ (resources.author|count) - 5 }}
+ more
+ {% endif %}
- {% if author.type == 'Person' %}
+ {% if author.type == 'Person' and author_count.value < 5 %}
 {{author.name}}
 {% endif %}
+ {% set author_count.value = author_count.value + 1 %}
 {% endfor %}
@@ -37,34 +45,49 @@
-                    {{resources.source}}
-                    {% for language in resources.inLanguage %}
-                    {{language|upper}}
+                    {% set found = false %}
+
+                    {% if resources.source[0].name %}
+
+                    {{ resources.source[0].name }}
+
+                    {% set found = true %}
+                    {% endif %}
+
+                    {% if not found %}
+
+
+                    {{ resources.source }}
+
+
+                    {% endif %}
+                    {% if resources.inLanguage %}
+                    {% for language in resources.inLanguage %}
+                    {{ language|upper }}
+                    {% endfor %}
+                    {% endif %}
+
+                    {{ resources.license }}
+                    {{ resources.encodingFormat|upper }}
+
+ {% for keyword in resources.keywords %} + {{ keyword }} {% endfor %} - {{ resources.encodingFormat|upper }} - {{resources.license}} -
- {% if resources.source == 'GESIS' or resources.source == 'Zenodo' %} -
- {{resources.datePublished}}
- {% endif %} - {% if resources.source == 'GEPRIS' %} +
- {{resources.dateLastModified}} -
- {% endif %} - {% if resources.source == 'CODALAB' %} -
- {{resources.dateCreated}} -
- {% endif %} - {% if resources.source == 'elg:corpus' or resources.source == 'elg:software/service'%} -
- {{resources.datePublished}} + {% if resources.source == 'GESIS' or resources.source[0].name == 'Zenodo' %} + {{ resources.datePublished }} + {% elif resources.source == 'GEPRIS' %} + {{ resources.dateLastModified }} + {% elif resources.source == 'CODALAB' %} + {{ resources.dateCreated }} + {% elif resources.source == 'elg:corpus' or resources.source == 'elg:software/service' %} + {{ resources.datePublished }} + {% endif %}
- {% endif %}
+
@@ -107,3 +130,18 @@
{% endfor %} + +
+
+ {% if session.displayed_search_results.resources + < session.total_search_results.resources %}
Displaying top {{ + session.displayed_search_results.resources }} resources out of + {{ session.total_search_results.resources }} +
+
+ + {% endif %} +
+
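The card template above caps the visible author list at five Person-type entries and then shows an "and N more" hint. The same truncation rule, as a minimal Python sketch (the helper name is hypothetical; author objects are assumed to expose `type` and `name`, as the template does):

    def format_author_line(authors, limit=5):
        # Mirror the template: list only Person-type authors, cap at `limit`
        persons = [a for a in authors if a.type == 'Person']
        shown = ", ".join(a.name for a in persons[:limit])
        hidden = len(persons) - limit
        return shown if hidden <= 0 else f"{shown} and {hidden} more"

The `namespace(value=0)` counter in the template plays the role of the list index here; Jinja2 needs it because a plain `{% set %}` inside a loop does not persist across iterations.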
diff --git a/templates/resource-details.html b/templates/resource-details.html
index 3035664..b7c20e0 100644
--- a/templates/resource-details.html
+++ b/templates/resource-details.html
@@ -1,6 +1,6 @@
 {% extends "layouts/base.html" %}
 
-{% block title %} Resources Details {% endblock title %}
+{% block title %} Resource Details {% endblock title %}
 
 {% block stylesheets %}
-

Women and Politics

+

{{resource.name}}

- Politics - Social engagement - Politicians - Demography - Social Dataset +
+ {% for keyword in resource.keywords %} + {{keyword}} + {% endfor %} +
- Study number: + Version: ZA6719 DOI: - 10.4232 + {{resource.identifier}} Publication Date: - 03.10.2019 + {{resource.datePublished}}
@@ -47,22 +47,22 @@

Women and Politics

Downloads
-
1K+
+
--
Saved
-
420
+
--
Cited
-
300
+
--
Views
-
278
+
--
@@ -70,74 +70,82 @@

Women and Politics

-
-
-
AUTHORS (5)
-
-
- Author 1 -
-
- Author 2 -
-
- Author 3 +
+
- Author 4 +
AUTHORS ({{ resource.author|length }})
-
- Author 5 +
+ {% if resource.author|length > 0 %} + {% for author in resource.author %} + + {% endfor %} + {% else %} +
+

No authors available.

+
+ {% endif %}
-
-
ABSTRACT
+
+
ABSTRACT
- - The survey focused on women's interest in politics in general and in specific policy areas in particular, - their expectations of political parties and politicians, and their willingness to become politically or socially involved. - Of particular interest here was the question of the extent to which there are differences in attitudes, - experiences and expectations within the group of women. - Do women academics have other political interests than women with low formal education? - Or do the expectations that women in West Germany have of politics differ from those of women in East Germany? - It should also be ascertained whether women perceive female politicians differently from male politicians. - Are there characteristics that they attribute more to a male politician or more to a female politician? After all, - the issue of equal rights played a prominent role in the survey. - The perceived realisation of equal rights for men and women in Germany in general and in selected areas of life was measured. - - Topics: Political understanding and interest: interest in politics; particularly interesting political topics; - attitude towards elites (politics and political parties); frequency of discussions on political and social topics with various groups - (life partner or family, friends and acquaintances, in a public political event, colleagues, sports and hobby clubs, - on Facebook or in other social networks); political information behaviour (informing people who lack knowledge about a political topic, - intensive reflection on political issues, dealing with various party positions, - interest in the person in office of important political offices, own suggestions - for improvement with regard to living conditions in the place of residence); - attitude towards political styles: opinion on how parties and politicians deal with each other (should be considerate, - can be hard to deal with as long as it is content and not personal, - or occasionally personal attacks on politicians of other parties are fine); political discussion behaviour. + {{ resource.description}}
-
-
POLITICAL AND SOCIAL ENGAGEMENT
+
+
SUPPLEMENTAL MATERIAL
+
-
+ {% for file_name, file_url in resource.encoding_contentUrl.items() %} + + {% endfor %} + +
+
REFERENCES
-
-
REFERENCES
+
+
-
+
+
CITATIONS
+
+
+ +
+
+ -
+ + +
+
-
+
-
RECOMMENDATIONS
+
RECOMMENDATIONS
-
+
+ +
+ +
FAIR Assessment
- +
+ + Coming soon .... + +
Jupyter Lab
- +
+ + Coming soon .... + +
@@ -235,10 +267,101 @@
Jupyter Lab
$(document).ready(function () { + var tooltipTriggerList = [].slice.call(document.querySelectorAll('[data-bs-toggle="tooltip"]')) + var tooltipList = tooltipTriggerList.map(function (tooltipTriggerEl) { + return new bootstrap.Tooltip(tooltipTriggerEl) + }) }); - + load_references = function () { + let doi = document.getElementById('resource_doi').innerHTML + $.ajax({ + url: '/resource-details-references/' + doi, + type: "GET", + data: { + }, + beforeSend: function () { + $('#references_block').html("
"); + }, + complete: function () { + // $('.loader').remove(); + }, + success: function (data) { + console.log(data) + $('.loader').remove(); + $('#references_block').html(data); + console.log('references have been loaded.') + }, + error: function (err) { + console.log(err); + return err + } + }); + } + + load_recommendations = function () { + let doi = document.getElementById('resource_doi').innerHTML + $.ajax({ + url: '/resource-details-recommendations/' + doi, + type: "GET", + data: { + }, + beforeSend: function () { + $('#recommendations_block').html("
"); + }, + complete: function () { + $('.loader').remove(); + }, + success: function (data) { + console.log(data) + $('#recommendations_block').html(data); + console.log('recommendations have been loaded.') + + }, + error: function (err) { + console.log(err); + return err + } + }); + } + + load_citations = function () { + let doi = document.getElementById('resource_doi').innerHTML + $.ajax({ + url: '/resource-details-citations/' + doi, + type: "GET", + data: { + }, + beforeSend: function () { + $('#citations_block').html("
"); + }, + complete: function () { + $('.loader').remove(); + }, + success: function (data) { + console.log(data) + $('#citations_block').html(data); + console.log('citations have been loaded.') + }, + error: function (err) { + console.log(err); + return err + } + }); + } + + $('#btn-load-references').click(function () { + load_references(); + }); + + $('#btn-load-recommendations').click(function () { + load_recommendations(); + }); + + $('#btn-load-citations').click(function () { + load_citations(); + }); diff --git a/templates/results.html b/templates/results.html index 34fef38..e34b337 100644 --- a/templates/results.html +++ b/templates/results.html @@ -298,14 +298,73 @@ // }, 'slow'); // $('thead tr th:first-child').focus().blur(); - // }); + // }); + + // $('#btnShareLinkCopy').on('click', function (e) { + $('body').on('click', "#btnShareLinkCopy", function (e) { + var txtShareLink = document.getElementById('txtShareLink'); + txtShareLink.select(); + if (navigator.clipboard) { + alert(txtShareLink.value) + navigator.clipboard.writeText(txtShareLink.value) + .then(() => { + var notification = alertify.notify('Link copied', 'success', 5); + }) + .catch((error) => { + var notification = alertify.notify('Copy operation failed: ' + error, 'error', 5); + }); + } else { + var notification = alertify.notify('Copy operation failed', 'error', 5); + } + }); + + $('body').on('click', ".share_modal_link", function (e) { + // alert('pressed') + publication = $(this).closest("tr").find('.publication_name') + url = window.location.origin + $(publication).attr('href') + title = $(publication).text().trim() + $('#txtShareLink').val(url) + + $('.smd a').each(function (i, obj) { + $(this).attr('href', $(this).attr('base-href').replace('[[url]]', encodeURIComponent(url))) + $(this).attr('href', $(this).attr('href').replace('[[title]]', encodeURIComponent(title))) + }); + }); + + // $('#share_modal_dialog').on('shown.bs.modal', function (e) { + // // do something... 
+ // }); + + $("body").on("shown.bs.modal", ".preview_modal_dialog", function (event) { + $(this).find('.modal-title').text(event.relatedTarget.getAttribute('data-bs-publ-name')); + $(this).find('.modal-body img').attr("src", event.relatedTarget.getAttribute('data-bs-img-src')); + }); + + $("body").on("shown.bs.modal", ".download_modal_dialog", function (event) { + $(this).find('.modal-title').text(event.relatedTarget.getAttribute('data-bs-publ-name')); + + var b = getBinary(event.relatedTarget.getAttribute('data-bs-embed-src')); + var b64 = base64Encode(b); + $(this).find('.modal-body iframe').attr("src", "data:application/pdf;base64," + b64); + }); + + $('body').on('click', "#btn_load_more_publications", function (e) { + console.log('load more publications button clicked') + load_more_publications() + }); + + $('body').on('click', "#btn_load_more_researchers", function (e) { + console.log('load more researchers button clicked') + load_more_researchers() + }); + $(window).scroll(function () { // console.log($(window).scrollTop(), ($(document).height() - $(window).height() - 500), $(document).height(), $(window).height(), ajax_request_sent) if ($('.nav-item .active').attr('id') == 'publications-tab') { - // Use the flag in the condition (so if sent and not yet received == false) + // Use the flag in the condition (so if sent and not yet received == false) if ($('#btn_load_more_publications').is(":visible") && //load more button is visible !ajax_request_sent && //no other ajax request is currently being processed $(window).scrollTop() >= $(document).height() - $(window).height() - 500) { // scroll is about to reach the bottom of the page @@ -320,7 +379,7 @@ } if ($('.nav-item .active').attr('id') == 'researchers-tab') { - // Use the flag in the condition (so if sent and not yet received == false) + // Use the flag in the condition (so if sent and not yet received == false) if ($('#btn_load_more_researchers').is(":visible") && //load more button is visible !ajax_request_sent && //no other ajax request is currently being processed $(window).scrollTop() >= $(document).height() - $(window).height() - 500) { // scroll is about to reach the bottom of the page @@ -334,6 +393,22 @@ } } + if ($('.nav-item .active').attr('id') == 'resources-tab') { + // Use the flag in the condition (so if sent and not yet received == false) + if ($('#btn_load_more_resources').is(":visible") && //load more button is visible + !ajax_request_sent && //no other ajax request is currently being processed + $(window).scrollTop() >= $(document).height() - $(window).height() - 500) { // scroll is about to reach the bottom of the page + + // Set the flag to prevent any concurring request + ajax_request_sent = true + + // ajax call get data from server and append to the div + console.log('load more via ajax now') + load_more_resources(); + } + } + + }); @@ -399,6 +474,35 @@ }); } + function load_more_resources() { + + $('#div_load_more_resources').remove() + jQuery.ajax({ + url: '/load-more-resources', + type: "GET", + beforeSend: function () { + $('#resources').append("
"); + }, + complete: function () { + $('.loader').remove(); + }, + // data: { + // action: "pj_load_more", + // pjCount: pjCount + // }, + success: function (data) { + $('#resources').append(data); + console.log('more resources loaded.') + + // Unset the flag + ajax_request_sent = false; + }, + error: function (err) { + console.log(err); + } + }); + } + diff --git a/utils.py b/utils.py index acea665..8cfc495 100644 --- a/utils.py +++ b/utils.py @@ -137,11 +137,11 @@ def parse_date(date_str): except (TypeError, ValueError): print(f"original date str: {date_str}") return "" - + # def sort_results_publications(results): -# def custom_sort_key(obj): -# desc = getattr(obj, 'description', '') -# pub_date = getattr(obj, 'datePublished', '0000-00-00') +# def custom_sort_key(obj): +# desc = getattr(obj, 'description', '') +# pub_date = getattr(obj, 'datePublished', '0000-00-00') # if desc == '': # return (0, pub_date) # return (1, pub_date) @@ -153,10 +153,10 @@ def sort_search_results(search_term, search_results): tokenized_results = [str(result).lower().split(" ") for result in search_results] if len(tokenized_results) > 0: bm25 = BM25Plus(tokenized_results) - + tokenized_query = search_term.lower().split(" ") doc_scores = bm25.get_scores(tokenized_query) - + for idx, doc_score in enumerate(doc_scores): search_results[idx].rankScore = doc_score @@ -168,6 +168,4 @@ def split_authors(authors_names, seperator, authors_list): _author = Author() _author.type = 'Person' _author.name = author - authors_list.append(_author) - - \ No newline at end of file + authors_list.append(_author) diff --git a/zen.py b/zen.py new file mode 100644 index 0000000..3c2853e --- /dev/null +++ b/zen.py @@ -0,0 +1,167 @@ +import requests +import utils +# from objects import Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Person, LearningResource, CreativeWork, VideoObject, ImageObject +from objects import thing, Article, Author, CreativeWork, Dataset, SoftwareApplication, VideoObject, ImageObject, LearningResource, Statistics +import logging +from sources import data_retriever +import traceback + +# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) +logger = logging.getLogger('nfdi_search_engine') + +@utils.timeit +def search(search_term, results): + + source = "Zenodo" + try: + search_result = data_retriever.retrieve_data(source=source, + base_url=utils.config["search_url_zenodo"], + search_term=search_term, + results=results) + + total_records_found = search_result.get("hits", {}).get("total", 0) + hits = search_result.get("hits", {}).get("hits", []) + total_hits = len(hits) + logger.info(f'{source} - {total_records_found} records matched; pulled top {total_hits}') + + if int(total_hits) > 0: + for hit in hits: + + metadata = hit.get('metadata', {}) + resource_type = metadata.get('resource_type', {}).get('type','OTHER').upper() + + if resource_type == 'PUBLICATION': + digitalObj = Article() + elif resource_type in ['PRESENTATION', 'POSTER']: + digitalObj = CreativeWork() + elif resource_type == 'DATASET': + digitalObj = Dataset() + elif resource_type == 'VIDEO': + digitalObj = VideoObject() + elif resource_type == 'IMAGE': + digitalObj = ImageObject() + elif resource_type == 'LESSON': + digitalObj = LearningResource() + elif resource_type == 'SOFTWARE': + digitalObj = SoftwareApplication() + elif resource_type == 'OTHER': + digitalObj = CreativeWork() + else: + print('This resource type is still not defined:', resource_type) + digitalObj = CreativeWork() + + 
digitalObj.identifier = hit.get('doi', '')
+                digitalObj.name = hit.get('title', '')
+                digitalObj.url = hit.get('links', {}).get('self', '')
+                digitalObj.genre = resource_type
+                digitalObj.description = utils.remove_html_tags(metadata.get('description', ''))
+
+                keywords = metadata.get('keywords', [])
+                if isinstance(keywords, list):
+                    # for keyword in keywords:
+                    #     digitalObj.keywords.append(keyword)
+                    for keyword in keywords:
+                        terms = [term.strip() for term in keyword.split(",")]
+                        digitalObj.keywords.extend(terms)
+
+                language = metadata.get('language', '')
+                digitalObj.inLanguage.append(language)
+                digitalObj.dateCreated = hit.get('created','')
+                digitalObj.dateModified = hit.get('modified','')
+                digitalObj.datePublished = metadata.get('publication_date', '')
+                digitalObj.license = metadata.get('license', {}).get('id', '')
+                digitalObj.creativeWorkStatus = hit.get('status','')
+                digitalObj.funder = metadata.get('grants', [{}])[0].get('funder', {}).get('name', '')
+
+                #views, # resource type
+                digitalObj.conditionsOfAccess = metadata.get('access-rights','')
+                if(digitalObj.conditionsOfAccess == ''):
+                    digitalObj.conditionsOfAccess = metadata.get('access_right','')
+
+                authors = metadata.get("creators", [])
+                for author in authors:
+                    _author = Author()
+                    _author.type = 'Person'
+                    _author.name = author.get("name", "")
+                    _author.identifier = author.get("orcid", "")
+                    _author.affiliation = author.get("affiliation", "")
+                    digitalObj.author.append(_author)
+
+                Stats = hit.get('stats', '')
+                _stats = Statistics()
+
+                _stats.downloads = Stats.get("downloads", '')
+                _stats.unique_downloads = Stats.get("unique_downloads", '')
+                _stats.views = Stats.get("views", '')
+                _stats.unique_views = Stats.get("unique_views", '')
+                _stats.version_downloads = Stats.get("version_downloads", '')
+                _stats.version_unique_downloads = Stats.get("version_unique_downloads", '')
+                _stats.version_unique_views = Stats.get("version_unique_views", '')
+                _stats.version_views = Stats.get("version_views", '')
+
+                digitalObj.stats = _stats
+
+                # relation = metadata.get('related_identifiers', '').get('relation', '').lower()
+                # identifier = metadata.get('related_identifiers', '').get('identifier','').lower()
+                # relation_map = {
+                #     'iscitedby': 'isCitedBy',
+                #     'issupplementto': 'isSupplementTo',
+                #     'ispartof': 'isPartOf',
+                #     'cites': 'cites',
+                #     'issourceof': 'isSourceOf',
+                #     'isderivedfrom': 'isDerivedFrom',
+                #     'issupplementedby': 'isSupplementedBy',
+                #     'ispreviousversionof': 'isPreviousVersionOf',
+                #     'documents': 'documents',
+                #     'haspart': 'hasPart'
+                # }
+                # if relation in relation_map:
+                #     getattr(digitalObj, relation_map[relation]).append(identifier)
+
+                contributors = metadata.get("contributors", [])
+                for contributor in contributors:
+                    _contributor = Author()
+                    _contributor.type = 'Person'
+                    _contributor.name = contributor.get("name", "")
+                    _contributor.identifier = contributor.get("orcid", "")
+                    _contributor.affiliation = contributor.get("affiliation", "")
+                    digitalObj.contributor.append(_contributor)
+
+                _source = thing()
+                _source.name = source
+                _source.identifier = hit.get("id", "")
+                _source.url = hit.get('links', {}).get('self_html', '')
+                digitalObj.source.append(_source)
+
+                files = hit.get('files', [])
+
+                # if resource_type == "LESSON":
+                for file in files:
+                    file_key = file.get("key", "")
+                    digitalObj.encoding_contentUrl[file_key] = file.get("links", {}).get("self", "")
+
+                digitalObj.softwareVersion = metadata.get("version", "")
+                if resource_type.upper() == 'PUBLICATION':
+                    digitalObj.abstract = digitalObj.description
+                    pages = hit.get("journal", {}).get('pages', '')
+                    if '-' in pages:
+                        a, b = pages.split('-')
+                        digitalObj.pageStart = a.strip()
+                        digitalObj.pageEnd = b.strip()
+                    else:
+                        digitalObj.pageStart = pages
+                        digitalObj.pageEnd = ''
+                    digitalObj.pagination = pages
+                    journal_info = metadata.get('journal', {})
+                    digitalObj.Journal = journal_info.get('title', '')
+                    digitalObj.JournalVolume = journal_info.get('volume', '')
+                    digitalObj.issue = journal_info.get('issue', '')
+                    #############################
+
+                    results['publications'].append(digitalObj)
+                elif resource_type.upper() in ['PRESENTATION', 'POSTER', 'DATASET', 'SOFTWARE', 'VIDEO', 'IMAGE', 'LESSON']:
+                    results['resources'].append(digitalObj)
+                else:
+                    results['others'].append(digitalObj)
+
+    except requests.exceptions.Timeout as ex:
+        logger.error(f'Timed out Exception: {str(ex)}')
+        results['timedout_sources'].append(source)
+
+    except Exception as ex:
+        logger.error(f'Exception: {str(ex)}')
+        logger.error(traceback.format_exc())
\ No newline at end of file
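As a sanity check on the journal handling above: Zenodo's `journal.pages` field may hold a range ("101-118"), a single value, or be absent entirely, which is why the publication branch guards the split. A minimal, self-contained sketch of that defensive parsing (assuming `hit` is the raw Zenodo record dict; the helper name is hypothetical):

    def extract_pages(hit: dict):
        # "pages" may be "101-118", "7", or missing entirely
        pages = (hit.get("journal") or {}).get("pages", "") or ""
        if "-" in pages:
            start, end = (part.strip() for part in pages.split("-", 1))
        else:
            start, end = pages, ""
        return start, end, pages

    # Example: extract_pages({"journal": {"pages": "101-118"}}) -> ("101", "118", "101-118")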