Merge pull request #151 from MaRDI4NFDI/dump_and_upload_fixes

Dump and upload fixes
MaRDI4NFDI · Oct 1, 2024 · d650a10 · d650a10
2 parents c400347 + 47a4df5
commit d650a10
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 13 deletions.
diff --git a/mardi_importer/mardi_importer/integrator/MardiEntities.py b/mardi_importer/mardi_importer/integrator/MardiEntities.py
@@ -1,6 +1,6 @@
 import re
 import sqlalchemy as db
-from sqlalchemy import and_
+from sqlalchemy import and_, case
 
 from mardiclient import MardiItem, MardiProperty
 from wikibaseintegrator.wbi_exceptions import ModificationFailed
@@ -28,6 +28,11 @@ def get_QID(self, alias=False):
         label = ""
         if 'en' in self.labels.values:
             label = self.labels.values['en'].value
+        label = bytes(label, "utf-8")
+        is_truncated = False
+        if len(label) > 250:
+            label = label[:250]
+            is_truncated = True
 
         def query_wikidata_table(field_type):
             # field_type = 1 : Label
@@ -53,7 +58,10 @@ def query_wikidata_table(field_type):
                             .join(wbt_term_in_lang, wbt_item_terms.columns.wbit_term_in_lang_id == wbt_term_in_lang.columns.wbtl_id)
                             .join(wbt_text_in_lang, wbt_term_in_lang.columns.wbtl_text_in_lang_id == wbt_text_in_lang.columns.wbxl_id)
                             .join(wbt_text, wbt_text.columns.wbx_id == wbt_text_in_lang.columns.wbxl_text_id)
-                            .where(and_(wbt_text.columns.wbx_text == bytes(label, "utf-8"), 
+                            .where(and_(
+                                        case(
+                                           (is_truncated, wbt_text.columns.wbx_text.like(label + b"%")),
+                                           else_=wbt_text.columns.wbx_text == label), 
                                         wbt_term_in_lang.columns.wbtl_type_id == field_type,
                                         wbt_text_in_lang.columns.wbxl_language == bytes("en", "utf-8"))))
                     results = connection.execute(query).fetchall()

diff --git a/mardi_importer/mardi_importer/scripts/import.py b/mardi_importer/mardi_importer/scripts/import.py
@@ -42,7 +42,7 @@ def main(**args):
             processed_dump_path=conf["processed_dump_path"],
         )
         importer = Importer(data_source)
-        importer.import_all(pull=True, push=False)
+        importer.import_all(pull=False, push=True)
 
     elif args["mode"] == "OpenML":
         # if args["conf_path"] is None:

diff --git a/mardi_importer/mardi_importer/zbmath/ZBMathSource.py b/mardi_importer/mardi_importer/zbmath/ZBMathSource.py
@@ -252,6 +252,8 @@ def process_data(self):
                     record["classifications"] = ";".join(msc)
                     if literal_eval(row["language"])["languages"]:
                         record["language"] = literal_eval(row["language"])["languages"][0]
+                    else:
+                        record["language"] = None
                     links = []
                     doi = None
                     for d in literal_eval(row["links"]):
@@ -268,22 +270,30 @@ def process_data(self):
                     record["publication_year"] = row["year"]
                     if literal_eval(row["source"])["series"]:
                         record["serial"] = literal_eval(row["source"])["series"][0]["title"]
+                    else:
+                        record["serial"] = None
                     record["zbl_id"] = row["identifier"]
                     ref_ids = []
                     for d in literal_eval(row["references"]):
                         ref_ids.append(str(d["zbmath"]["document_id"]))
                     record["references"] = ";".join(ref_ids)
+                    review_text = None
+                    review_sign = None
+                    reviewer_id = None
                     for d in literal_eval(row["editorial_contributions"]): 
                         if d["contribution_type"] == "review":
                             review_text = d["text"]
-                            row["review_text"] = review_text
-                            row["review_sign"] = d["reviewer"]["name"]
-                            row["reviewer_id"] = d["reviewer"]["author_code"]
+                            review_sign = d["reviewer"]["name"]
+                            reviewer_id = d["reviewer"]["author_code"]
                             break
+                    record["review_text"] = review_text
+                    record["review_sign"] = review_sign
+                    record["reviewer_id"] = reviewer_id
+
                     if record:
                         for key, value in record.items():
                             if isinstance(value, str):
-                                record[key] = value.replace("\t", " ").replace("\n", " ")
+                                record[key] = value.replace("\t", "\T").replace("\n", "\N").replace("\r", "\R")
                         outfile.write(
                             "\t".join(str(x) for x in record.values()) + "\n"
                         )
@@ -420,12 +430,12 @@ def push(self):
                 info_dict = dict(zip(headers, split_line))
                 # this part is for continuing at a certain position if the import failed
                 # if not found:
-                #     if info_dict["de_number"].strip() != " ":
-                #     if info_dict["document_title"] != "Unimodular supergravity":
-                #         continue
-                #     else:
-                #         found = True
-                #         continue
+                #     if info_dict["de_number"].strip() != "49686":
+                #     #if info_dict["document_title"] != "Unimodular supergravity":
+                    #     continue
+                    # else:
+                    #     found = True
+                    #     continue
                 # if there is no title, don't add
                 if self.conflict_string in info_dict["document_title"]:
                     if (

diff --git a/mardi_importer/mardi_importer/zbmath/new_entities.json b/mardi_importer/mardi_importer/zbmath/new_entities.json
@@ -14,12 +14,21 @@
             "label": "zbMATH Keywords",
             "description": "keyword string from zbMATH",
             "datatype": "string"
+        },
+        {
+            "label": "MaRDI profile type",
+            "description": "defines the types of MaRDI profiles expected to work for this item",
+            "datatype": "wikibase-item"
         }
     ],
     "items": [
         {
             "label": "MaRDI person profile",
             "description": "type of MaRDI profile"
+        },
+        {
+            "label": "MaRDI publication profile",
+            "description": "type of MaRDI profile"
         }
     ]
 }