Skip to content

Commit

Permalink
Add sample and populate
Browse files Browse the repository at this point in the history
  • Loading branch information
evgepab committed Jun 25, 2024
1 parent 2413938 commit c8d96d3
Showing 1 changed file with 29 additions and 24 deletions.
53 changes: 29 additions & 24 deletions src/alexandria3k/data_sources/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,14 @@
# pylint: disable=invalid-name
# pylint: disable=too-many-lines


def dict_value(dictionary, key):
    """Look up key in dictionary; yield None when the dictionary is
    None/empty or does not contain the key"""
    # A falsy container (None or {}) short-circuits before .get() is called,
    # so callers can chain lookups on possibly-missing nested objects.
    return dictionary.get(key) if dictionary else None


def float_value(string):
    """Return the float value of a string or number, or None if None
    (or an empty string) is passed.

    Numeric zero (0 or 0.0) is a valid value and is returned as 0.0 rather
    than being treated as missing; this matters for coordinates that lie on
    the equator or the prime meridian and arrive as JSON numbers.
    """
    # Real numbers are converted directly so that a falsy 0 is preserved.
    # bool is excluded to keep the previous handling of True/False intact.
    if isinstance(string, (int, float)) and not isinstance(string, bool):
        return float(string)
    # Remaining inputs keep the original truthiness rule: None and ""
    # map to None, any non-empty string is parsed by float().
    return float(string) if string else None
Expand Down Expand Up @@ -95,12 +97,11 @@ class WorksCursor(DataciteElementsCursor):

def __init__(self, table):
super().__init__(table, None)
self.files_cursor = TarFilesCursor(
table, table.data_source.get_file_path()
)
self.files_cursor = TarFilesCursor(table)
# Initialized in Filter()
self.item_index = None
self.cached_json_item_index = None
self.json_data = None

def element_name(self):
"""The work key from which to retrieve the elements. Not part of the
Expand Down Expand Up @@ -480,6 +481,7 @@ def Column(self, col):
ColumnMeta("container_id"),
ColumnMeta("work_id"),
ColumnMeta("name", lambda row: dict_value(row, "name")),
ColumnMeta("name_type", lambda row: dict_value(row, "nameType")),
ColumnMeta("given_name", lambda row: dict_value(row, "givenName")),
ColumnMeta(
"family_name", lambda row: dict_value(row, "familyName")
Expand Down Expand Up @@ -654,13 +656,11 @@ def Column(self, col):
ColumnMeta("work_id"),
ColumnMeta("container_id"),
ColumnMeta("rights", lambda row: dict_value(row, "rights")),
ColumnMeta(
"lang", lambda row: dict_value(row, "lang")
),
ColumnMeta("lang", lambda row: dict_value(row, "lang")),
ColumnMeta("rights_uri", lambda row: dict_value(row, "rightsUri")),
ColumnMeta(
"rights_identifier",
lambda row: dict_value(row, "rightsIdentifier")
"rights_identifier",
lambda row: dict_value(row, "rightsIdentifier"),
),
ColumnMeta(
"rights_identifier_scheme",
Expand Down Expand Up @@ -706,12 +706,14 @@ def Column(self, col):
[
float_value(
dict_value(
dict_value(row, "geoLocationPoint"), "pointLongitude"
dict_value(row, "geoLocationPoint"),
"pointLongitude",
)
),
float_value(
dict_value(
dict_value(row, "geoLocationPoint"), "pointLatitude"
dict_value(row, "geoLocationPoint"),
"pointLatitude",
)
),
]
Expand All @@ -723,22 +725,26 @@ def Column(self, col):
[
float_value(
dict_value(
dict_value(row, "geoLocationBox"), "westBoundLongitude"
dict_value(row, "geoLocationBox"),
"westBoundLongitude",
)
),
float_value(
dict_value(
dict_value(row, "geoLocationBox"), "eastBoundLongitude"
dict_value(row, "geoLocationBox"),
"eastBoundLongitude",
)
),
float_value(
dict_value(
dict_value(row, "geoLocationBox"), "southBoundLatitude"
dict_value(row, "geoLocationBox"),
"southBoundLatitude",
)
),
float_value(
dict_value(
dict_value(row, "geoLocationBox"), "northBoundLatitude"
dict_value(row, "geoLocationBox"),
"northBoundLatitude",
)
),
]
Expand Down Expand Up @@ -778,6 +784,7 @@ def Column(self, col):
]


# pylint: disable-next=too-many-instance-attributes
class TarFiles:
"""The source of the files residing in the tar.gz file"""

Expand All @@ -787,12 +794,13 @@ def __init__(
sample,
):
self.file_path = file_path
self.sample = sample # TODO
self.sample = sample
self.doi_prefix = None
self.data_files = []
self.file_index = -1
self.reader = None
self.cached_file_contents_index = None
self.cached_file_contents = None
self.generator = self.tar_file_generator()

try:
Expand All @@ -811,11 +819,12 @@ def tar_file_generator(self):
continue
# Obtain DOI prefix from file name to avoid extraction and parsing
(_dot, doi_prefix, file_name) = self.tar_info.name.split("/")
if not self.sample(file_name):
continue
self.doi_prefix = doi_prefix
self.data_files.append(doi_prefix + '/' + file_name)
self.data_files.append(doi_prefix + "/" + file_name)
self.file_index += 1
yield self.file_index
yield None

def get_file_contents(self, file_index):
"""Return the contents of the file at the specified index"""
Expand All @@ -828,15 +837,11 @@ def get_file_contents(self, file_index):
self.bytes_read += len(self.cached_file_contents)
self.cached_file_contents_index = self.file_index
return self.cached_file_contents
index = next(self.generator)
if index is None:
return None
next(self.generator)
except tarfile.ReadError as e:
if "unexpected end of data" in str(e):
return None
else:
raise
except StopIteration as e:
except StopIteration:
return None

def get_bytes_read(self):
Expand Down Expand Up @@ -867,7 +872,7 @@ class TarFilesCursor(ItemsCursor):
"""A cursor that iterates over the elements in a tar file
Not used directly by an SQLite table"""

def __init__(self, table, file_path):
def __init__(self, table):
"""Not part of the apsw VTCursor interface.
The table argument is a StreamingTable object"""
super().__init__(table)
Expand Down

0 comments on commit c8d96d3

Please sign in to comment.