Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
nooraangelva committed Jul 26, 2022
1 parent 1200334 commit 42b238b
Show file tree
Hide file tree
Showing 7 changed files with 660 additions and 1 deletion.
141 changes: 141 additions & 0 deletions inspire_utils/grobid_authors_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2020 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function
from inspire_schemas.builders import LiteratureBuilder
from parsel import Selector


class GrobidAuthors(object):
    """Indexable collection of the authors found in a GROBID XML document.

    Only ``<author>`` elements that carry a ``persName/surname`` with
    non-blank text are kept; every item is handed out wrapped in a
    ``GrobidAuthor``.
    """

    def __init__(self, xml_text):
        """Parse ``xml_text`` and select the usable author nodes.

        Args:
            xml_text: the XML document, as text or as UTF-8 encoded bytes.
        """
        text = xml_text.decode('utf-8') if isinstance(xml_text, bytes) else xml_text

        selector = Selector(text=text, type="xml")
        # Namespaces are dropped so the XPath expressions can use bare names.
        selector.remove_namespaces()

        self._xml = selector
        self._parsed_authors = selector.xpath(
            "//author[persName/surname[string-length(normalize-space()) > 0]]"
        )
        self._builder = None

    def __getitem__(self, item):
        """Return the selected author(s) at ``item`` wrapped in a ``GrobidAuthor``."""
        selected = self._parsed_authors[item]
        return GrobidAuthor(selected)

    def __len__(self):
        """Return how many author nodes were selected."""
        return len(self._parsed_authors)

    def parse_one(self):
        """Lazily yield one result dict per author.

        Each dict has two keys: ``'author'``, the value returned by
        ``LiteratureBuilder.make_author`` for the author's full name, raw
        affiliations and emails, and ``'parsed_affiliations'``, the author's
        ``processed_affiliations``.
        """
        # A fresh builder is created for every run of the generator.
        self._builder = LiteratureBuilder()
        for grobid_author in self:
            built_author = self._builder.make_author(
                full_name=grobid_author.fullname,
                raw_affiliations=grobid_author.raw_affiliations,
                emails=grobid_author.emails,
            )
            yield {
                'author': built_author,
                'parsed_affiliations': grobid_author.processed_affiliations,
            }

    def parse_all(self):
        """Return the output of ``parse_one`` collected into a list."""
        return [parsed for parsed in self.parse_one()]


class GrobidAuthor(object):
    """Read-only view over a single GROBID ``<author>`` element.

    Wraps the selector of one author node and exposes its name parts,
    emails and affiliations as plain strings, lists and dicts.
    """

    def __init__(self, author_selector):
        """Keep the selector of the ``<author>`` node to read from.

        Args:
            author_selector: object exposing ``xpath()`` for the author node.
        """
        self._author = author_selector

    @staticmethod
    def _extract(source, path, type=None, text=False):
        """Query ``source`` with ``path``, keeping only non-blank nodes.

        Args:
            source: selector to run the query on.
            path: XPath expression, relative to ``source``.
            type: when given, keep only nodes whose ``type`` attribute
                equals this value.
            text: when true, select the direct text children of the
                matched nodes instead of the nodes themselves.

        Returns:
            The result of ``source.xpath()`` for the assembled expression.
        """
        # NOTE: ``type`` shadows the builtin; the name is kept because
        # callers pass it by keyword.
        path += "[string-length(normalize-space()) > 0]"
        if type:
            path += u"[@type='{}']".format(type)
        if text:
            path += "/text()"
        return source.xpath(path)

    @staticmethod
    def _clean_texts(texts):
        """Strip every string and drop the ones that end up empty.

        A matched element can own whitespace-only text nodes (for example
        the line break in front of a child element); those must not leak
        out as empty values.
        """
        stripped = (text.strip() for text in texts)
        return [text for text in stripped if text]

    @staticmethod
    def _join_name(lastname, names):
        """Build ``"lastname,names"``, or just ``lastname`` without forenames."""
        if not names:
            # Avoid a dangling comma when the author has no usable forename.
            return lastname
        return u",".join([lastname, names])

    @classmethod
    def _extract_string(cls, source, path, type=None, join_char=u' '):
        """Return the non-blank texts matched by ``path`` joined into one string.

        Args:
            source: selector to run the query on.
            path: XPath expression, relative to ``source``.
            type: optional ``type`` attribute filter (see ``_extract``).
            join_char: separator placed between the individual texts.

        Returns:
            The joined string, empty when nothing matched.
        """
        data = cls._extract(source, path, type, text=True).getall()
        return join_char.join(cls._clean_texts(data))

    @classmethod
    def _extract_strings_list(cls, source, path, type=None):
        """Return the non-blank texts matched by ``path`` as a list.

        Args:
            source: selector to run the query on.
            path: XPath expression, relative to ``source``.
            type: optional ``type`` attribute filter (see ``_extract``).

        Returns:
            A list of stripped strings, empty when nothing matched.
        """
        data = cls._extract(source, path, type, text=True).getall()
        return cls._clean_texts(data)

    @staticmethod
    def _build_address(street, city, post_code, country):
        """Assemble the address dict from its (possibly empty) components.

        Returns:
            A dict with ``postal_address`` (the non-empty components joined
            with ``', '``) plus ``cities``, ``postal_code`` and ``country``
            for the components that are present; an empty dict when every
            component is empty.
        """
        address_list = [element for element in [street, city, post_code, country] if element]
        address = {"postal_address": ', '.join(address_list)} if address_list else {}
        if city:
            address['cities'] = [city]
        if post_code:
            address['postal_code'] = post_code
        if country:
            address['country'] = country
        return address

    @property
    def names(self):
        """All forenames of the author, joined with single spaces."""
        return self._extract_string(self._author, "persName/forename")

    @property
    def lastname(self):
        """All surnames of the author, joined with single spaces."""
        return self._extract_string(self._author, "persName/surname")

    @property
    def fullname(self):
        """Name in ``"lastname,names"`` form; just the last name if no forename."""
        return self._join_name(self.lastname, self.names)

    @property
    def raw_affiliations(self):
        """Texts of the ``raw_affiliation`` notes of the author's affiliations."""
        return self._extract_strings_list(self._author, "affiliation/note", type="raw_affiliation")

    @property
    def emails(self):
        """Non-blank email addresses attached to the author."""
        return self._extract_strings_list(self._author, "email")

    @property
    def processed_affiliations(self):
        """Structured affiliations of the author.

        Returns:
            A list with one dict per non-blank ``<affiliation>`` element,
            holding the optional keys ``name`` (institution names joined
            with ``', '``), ``department`` (list of department names) and
            ``address`` (see ``_build_address``); ``None`` when the author
            has no such element.
        """
        affiliations = []
        for affiliation in self._extract(self._author, "affiliation"):
            affiliation_obj = {}
            name = self._extract_string(affiliation, "orgName", type="institution", join_char=', ')
            department = self._extract_strings_list(affiliation, "orgName", type="department")

            street = self._extract_string(affiliation, 'address/addrLine')
            settlement = self._extract_string(affiliation, 'address/settlement')
            # The TEI element is spelled ``postCode``; the previously used
            # ``post_code`` spelling is still honoured as a fallback.
            post_code = (
                self._extract_string(affiliation, 'address/postCode')
                or self._extract_string(affiliation, 'address/post_code')
            )
            country = self._extract_string(affiliation, 'address/country')

            address = self._build_address(street, settlement, post_code, country)

            if name:
                affiliation_obj['name'] = name
            if department:
                affiliation_obj['department'] = department
            if address:
                affiliation_obj['address'] = address
            # NOTE(review): an affiliation with none of the fields above
            # still contributes an empty dict - confirm that is intended.
            affiliations.append(affiliation_obj)
        return affiliations or None
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@
'python-dateutil~=2.0,>=2.6.1',
'six~=1.0,>=1.10.0',
'elasticsearch==7.1.0',
'elasticsearch-dsl~=7.1'
'elasticsearch-dsl~=7.1',
'inspire-schemas==61.4.12',
'parsel>=1.5'
]

docs_require = []
Expand Down
69 changes: 69 additions & 0 deletions tests/fixtures/grobid_empty_author_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first"> FIRST </forename>
<surname></surname>
</persName>
<email> [email protected] </email>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">XYZ</forename>
<surname>ABC</surname>
</persName>
<email> </email>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first"> </forename>
<surname>YZC</surname>
</persName>
<email> [email protected] </email>
</author>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
109 changes: 109 additions & 0 deletions tests/fixtures/grobid_full_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Partha</forename>
<surname>Nandi</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
<address>
<addrLine>Sector III, Salt Lake</addrLine>
<settlement>Kolkata-700106</settlement>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sankarshan</forename>
<surname>Sahu</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff1">
<note type="raw_affiliation">
<label>2</label> Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India.
</note>
<orgName type="department">Indian Institute of Engineering Science and Technology</orgName>
<address>
<postCode>Bengal-711103</postCode>
<settlement>Shibpur, Howrah</settlement>
<region>West</region>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sayan</forename>
<forename type="middle">Kumar</forename>
<surname>Pal</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
<address>
<addrLine>Sector III, Salt Lake</addrLine>
<settlement>Kolkata-700106</settlement>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
Loading

0 comments on commit 42b238b

Please sign in to comment.