Skip to content

Commit

Permalink
Merge pull request #151 from MaRDI4NFDI/dump_and_upload_fixes
Browse files Browse the repository at this point in the history
Dump and upload fixes
  • Loading branch information
LizzAlice authored Oct 1, 2024
2 parents c400347 + 47a4df5 commit d650a10
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 13 deletions.
12 changes: 10 additions & 2 deletions mardi_importer/mardi_importer/integrator/MardiEntities.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
import sqlalchemy as db
from sqlalchemy import and_
from sqlalchemy import and_, case

from mardiclient import MardiItem, MardiProperty
from wikibaseintegrator.wbi_exceptions import ModificationFailed
Expand Down Expand Up @@ -28,6 +28,11 @@ def get_QID(self, alias=False):
label = ""
if 'en' in self.labels.values:
label = self.labels.values['en'].value
label = bytes(label, "utf-8")
is_truncated = False
if len(label) > 250:
label = label[:250]
is_truncated = True

def query_wikidata_table(field_type):
# field_type = 1 : Label
Expand All @@ -53,7 +58,10 @@ def query_wikidata_table(field_type):
.join(wbt_term_in_lang, wbt_item_terms.columns.wbit_term_in_lang_id == wbt_term_in_lang.columns.wbtl_id)
.join(wbt_text_in_lang, wbt_term_in_lang.columns.wbtl_text_in_lang_id == wbt_text_in_lang.columns.wbxl_id)
.join(wbt_text, wbt_text.columns.wbx_id == wbt_text_in_lang.columns.wbxl_text_id)
.where(and_(wbt_text.columns.wbx_text == bytes(label, "utf-8"),
.where(and_(
case(
(is_truncated, wbt_text.columns.wbx_text.like(label + b"%")),
else_=wbt_text.columns.wbx_text == label),
wbt_term_in_lang.columns.wbtl_type_id == field_type,
wbt_text_in_lang.columns.wbxl_language == bytes("en", "utf-8"))))
results = connection.execute(query).fetchall()
Expand Down
2 changes: 1 addition & 1 deletion mardi_importer/mardi_importer/scripts/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def main(**args):
processed_dump_path=conf["processed_dump_path"],
)
importer = Importer(data_source)
importer.import_all(pull=True, push=False)
importer.import_all(pull=False, push=True)

elif args["mode"] == "OpenML":
# if args["conf_path"] is None:
Expand Down
30 changes: 20 additions & 10 deletions mardi_importer/mardi_importer/zbmath/ZBMathSource.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ def process_data(self):
record["classifications"] = ";".join(msc)
if literal_eval(row["language"])["languages"]:
record["language"] = literal_eval(row["language"])["languages"][0]
else:
record["language"] = None
links = []
doi = None
for d in literal_eval(row["links"]):
Expand All @@ -268,22 +270,30 @@ def process_data(self):
record["publication_year"] = row["year"]
if literal_eval(row["source"])["series"]:
record["serial"] = literal_eval(row["source"])["series"][0]["title"]
else:
record["serial"] = None
record["zbl_id"] = row["identifier"]
ref_ids = []
for d in literal_eval(row["references"]):
ref_ids.append(str(d["zbmath"]["document_id"]))
record["references"] = ";".join(ref_ids)
review_text = None
review_sign = None
reviewer_id = None
for d in literal_eval(row["editorial_contributions"]):
if d["contribution_type"] == "review":
review_text = d["text"]
row["review_text"] = review_text
row["review_sign"] = d["reviewer"]["name"]
row["reviewer_id"] = d["reviewer"]["author_code"]
review_sign = d["reviewer"]["name"]
reviewer_id = d["reviewer"]["author_code"]
break
record["review_text"] = review_text
record["review_sign"] = review_sign
record["reviewer_id"] = reviewer_id

if record:
for key, value in record.items():
if isinstance(value, str):
record[key] = value.replace("\t", " ").replace("\n", " ")
record[key] = value.replace("\t", "\\T").replace("\n", "\\N").replace("\r", "\\R")
outfile.write(
"\t".join(str(x) for x in record.values()) + "\n"
)
Expand Down Expand Up @@ -420,12 +430,12 @@ def push(self):
info_dict = dict(zip(headers, split_line))
# this part is for continuing at a certain position if the import failed
# if not found:
# if info_dict["de_number"].strip() != " ":
# if info_dict["document_title"] != "Unimodular supergravity":
# continue
# else:
# found = True
# continue
# if info_dict["de_number"].strip() != "49686":
# #if info_dict["document_title"] != "Unimodular supergravity":
# continue
# else:
# found = True
# continue
# if there is no title, don't add
if self.conflict_string in info_dict["document_title"]:
if (
Expand Down
9 changes: 9 additions & 0 deletions mardi_importer/mardi_importer/zbmath/new_entities.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,21 @@
"label": "zbMATH Keywords",
"description": "keyword string from zbMATH",
"datatype": "string"
},
{
"label": "MaRDI profile type",
"description": "defines the types of MaRDI profiles expected to work for this item",
"datatype": "wikibase-item"
}
],
"items": [
{
"label": "MaRDI person profile",
"description": "type of MaRDI profile"
},
{
"label": "MaRDI publication profile",
"description": "type of MaRDI profile"
}
]
}

0 comments on commit d650a10

Please sign in to comment.