Commit
Merge pull request #132 from MaRDI4NFDI/openml
Openml
LizzAlice authored Mar 21, 2024
2 parents 76b54d0 + dc6e441 commit 9e2e975
Showing 9 changed files with 831 additions and 1 deletion.
437 changes: 437 additions & 0 deletions mardi_importer/mardi_importer/openml/OpenMLDataset.py

Large diffs are not rendered by default.

62 changes: 62 additions & 0 deletions mardi_importer/mardi_importer/openml/OpenMLPublication.py
@@ -0,0 +1,62 @@
class OpenMLPublication:
    """Class to manage OpenML publications in the local Wikibase instance.

    If an item with the given DOI or arXiv ID already exists, it is
    fetched instead of being created again.

    Attributes:
        integrator:
            MardiIntegrator instance
        identifier:
            arXiv ID or DOI
        identifier_type:
            'arxiv' or 'doi'
    """

    def __init__(
        self,
        integrator,
        identifier,
        identifier_type,
    ):
        self.api = integrator
        self.identifier = identifier
        self.identifier_type = identifier_type
        self.item = self.api.item.new()

    def exists(self):
        """Checks if an item with this identifier exists in the local Wikibase instance.

        Returns:
            String: Entity ID, or None if no matching item was found
        """
        if self.identifier_type == "doi":
            QID_list = self.api.search_entity_by_value(
                "wdt:P356", self.identifier
            )
        elif self.identifier_type == "arxiv":
            QID_list = self.api.search_entity_by_value(
                "wdt:P818", self.identifier
            )
        else:
            raise ValueError(f"Invalid identifier type {self.identifier_type}")
        self.QID = QID_list[0] if QID_list else None
        if self.QID:
            print(f"Publication with qid {self.QID} exists")
        return self.QID

    def create(self):
        self.item.add_claim("wdt:P31", "wd:Q13442814")
        if self.identifier_type == "doi":
            self.item.add_claim("wdt:P356", self.identifier)
        elif self.identifier_type == "arxiv":
            self.item.add_claim("wdt:P818", self.identifier)
        profile_prop = self.api.get_local_id_by_label("MaRDI profile type", "property")
        profile_target = self.api.get_local_id_by_label("MaRDI publication profile", "property")
        self.item.add_claim(profile_prop, profile_target)
        self.item.descriptions.set(language="en", value="scientific article about an OpenML dataset")
        publication_id = self.item.write().id
        print(f"Publication with the qid {publication_id} has been created.")
        return publication_id
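For orientation, a minimal usage sketch (not part of the diff; it assumes a configured MardiIntegrator, and the DOI is a placeholder taken from the example data further below):

    from mardi_importer.integrator import MardiIntegrator

    integrator = MardiIntegrator()
    publication = OpenMLPublication(
        integrator=integrator,
        identifier="10.5555/32231",   # placeholder DOI for illustration
        identifier_type="doi",
    )
    # Reuse an existing item when the DOI is already known locally,
    # otherwise create a new publication item.
    qid = publication.exists() or publication.create()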

184 changes: 184 additions & 0 deletions mardi_importer/mardi_importer/openml/OpenMLSource.py
@@ -0,0 +1,184 @@
from mardi_importer.importer import ADataSource
from mardi_importer.integrator import MardiIntegrator
from .OpenMLDataset import OpenMLDataset
from itertools import zip_longest
import openml
import os
import json
import pickle

class OpenMLSource(ADataSource):
    def __init__(self):
        self.integrator = MardiIntegrator()
        self.filepath = os.path.realpath(os.path.dirname(__file__))

    def setup(self):
        """Create all necessary properties and entities for OpenML."""
        # Import entities from Wikidata
        filename = self.filepath + "/wikidata_entities.txt"
        self.integrator.import_entities(filename=filename)
        self.create_local_entities()

    def create_local_entities(self):
        filename = self.filepath + "/new_entities.json"
        with open(filename) as f:
            entities = json.load(f)

        for prop_element in entities["properties"]:
            prop = self.integrator.property.new()
            prop.labels.set(language="en", value=prop_element["label"])
            prop.descriptions.set(language="en", value=prop_element["description"])
            prop.datatype = prop_element["datatype"]
            if not prop.exists():
                prop.write()

        for item_element in entities["items"]:
            item = self.integrator.item.new()
            item.labels.set(language="en", value=item_element["label"])
            item.descriptions.set(language="en", value=item_element["description"])
            for key, value in item_element["claims"].items():
                item.add_claim(key, value=value)
            if not item.exists():
                item.write()

    def pull(self):
        dataset_dict = {
            "name": [], "dataset_id": [], "version": [], "creators": [],
            "contributors": [], "collection_date": [], "upload_date": [],
            "license": [], "url": [], "default_target_attribute": [],
            "row_id_attribute": [], "tags": [], "original_data_url": [],
            "paper_url": [], "md5_checksum": [], "features": [],
            "num_binary_features": [], "num_classes": [], "num_features": [],
            "num_instances": [], "num_instances_missing_vals": [],
            "num_missing_vals": [], "num_numeric_features": [],
            "num_symbolic_features": [], "format": [],
        }
        # Maps dataset_dict keys to the quality names used by the OpenML API.
        quality_names = {
            "num_binary_features": "NumberOfBinaryFeatures",
            "num_classes": "NumberOfClasses",
            "num_features": "NumberOfFeatures",
            "num_instances": "NumberOfInstances",
            "num_instances_missing_vals": "NumberOfInstancesWithMissingValues",
            "num_missing_vals": "NumberOfMissingValues",
            "num_numeric_features": "NumberOfNumericFeatures",
            "num_symbolic_features": "NumberOfSymbolicFeatures",
        }
        dataset_df = openml.datasets.list_datasets(output_format="dataframe")
        did_list = dataset_df["did"].unique()
        for did in did_list:
            # Qualities and feature metadata are sometimes missing or broken
            # on the server side; fall back to fetching progressively less.
            try:
                ds = openml.datasets.get_dataset(int(did), download_data=False)
            except Exception:
                try:
                    ds = openml.datasets.get_dataset(
                        int(did), download_data=False, download_qualities=False
                    )
                except Exception:
                    ds = openml.datasets.get_dataset(
                        int(did),
                        download_data=False,
                        download_qualities=False,
                        download_features_meta_data=False,
                    )
            dataset_dict["name"].append(ds.name)
            dataset_dict["dataset_id"].append(did)
            dataset_dict["version"].append(ds.version)
            dataset_dict["creators"].append(ds.creator)
            dataset_dict["contributors"].append(ds.contributor)
            dataset_dict["collection_date"].append(ds.collection_date)
            dataset_dict["upload_date"].append(ds.upload_date)
            dataset_dict["license"].append(ds.licence)
            dataset_dict["url"].append(ds.url)
            dataset_dict["default_target_attribute"].append(ds.default_target_attribute)
            dataset_dict["row_id_attribute"].append(ds.row_id_attribute)
            dataset_dict["tags"].append(ds.tag)
            dataset_dict["original_data_url"].append(ds.original_data_url)
            dataset_dict["paper_url"].append(ds.paper_url)
            dataset_dict["md5_checksum"].append(ds.md5_checksum)
            dataset_dict["format"].append(ds.format)
            try:
                dataset_dict["features"].append(ds.features)
            except Exception:
                dataset_dict["features"].append(None)
            try:
                qualities = ds.qualities
            except Exception:
                qualities = None
            for key, quality_name in quality_names.items():
                if qualities and quality_name in qualities:
                    dataset_dict[key].append(qualities[quality_name])
                else:
                    dataset_dict[key].append(None)
        return dataset_dict

    def push(self):
        with open("/data/dataset_dict.p", "rb") as handle:
            dataset_dict = pickle.load(handle)
# dataset_dict = {'name': ['kr-vs-kp'],
# 'description': ['Author: Alen Shapiro\nSource: [UCI](https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn))\nPlease cite: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n1. Title: Chess End-Game -- King+Rook versus King+Pawn on a7\n(usually abbreviated KRKPA7). The pawn on a7 means it is one square\naway from queening. It is the King+Rook\'s side (white) to move.\n\n2. Sources:\n(a) Database originally generated and described by Alen Shapiro.\n(b) Donor/Coder: Rob Holte ([email protected]). The database\nwas supplied to Holte by Peter Clark of the Turing Institute\nin Glasgow ([email protected]).\n(c) Date: 1 August 1989\n\n3. Past Usage:\n- Alen D. Shapiro (1983,1987), "Structured Induction in Expert Systems",\nAddison-Wesley. This book is based on Shapiro\'s Ph.D. thesis (1983)\nat the University of Edinburgh entitled "The Role of Structured\nInduction in Expert Systems".\n- Stephen Muggleton (1987), "Structuring Knowledge by Asking Questions",\npp.218-229 in "Progress in Machine Learning", edited by I. Bratko\nand Nada Lavrac, Sigma Press, Wilmslow, England SK9 5BB.\n- Robert C. Holte, Liane Acker, and Bruce W. Porter (1989),\n"Concept Learning and the Problem of Small Disjuncts",\nProceedings of IJCAI. Also available as technical report AI89-106,\nComputer Sciences Department, University of Texas at Austin,\nAustin, Texas 78712.\n\n4. Relevant Information:\nThe dataset format is described below. Note: the format of this\ndatabase was modified on 2/26/90 to conform with the format of all\nthe other databases in the UCI repository of machine learning databases.\n\n5. Number of Instances: 3196 total\n\n6. Number of Attributes: 36\n\n7. Attribute Summaries:\nClasses (2): -- White-can-win ("won") and White-cannot-win ("nowin").\nI believe that White is deemed to be unable to win if the Black pawn\ncan safely advance.\nAttributes: see Shapiro\'s book.\n\n8. Missing Attributes: -- none\n\n9. Class Distribution:\nIn 1669 of the positions (52%), White can win.\nIn 1527 of the positions (48%), White cannot win.\n\nThe format for instances in this database is a sequence of 37 attribute values.\nEach instance is a board-descriptions for this chess endgame. The first\n36 attributes describe the board. The last (37th) attribute is the\nclassification: "win" or "nowin". There are 0 missing values.\nA typical board-description is\n\nf,f,f,f,f,f,f,f,f,f,f,f,l,f,n,f,f,t,f,f,f,f,f,f,f,t,f,f,f,f,f,f,f,t,t,n,won\n\nThe names of the features do not appear in the board-descriptions.\nInstead, each feature correponds to a particular position in the\nfeature-value list. For example, the head of this list is the value\nfor the feature "bkblk". The following is the list of features, in\nthe order in which their values appear in the feature-value list:\n\n[bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,cntxt,dsopp,dwipd,\nhdchk,katri,mulch,qxmsq,r2ar8,reskd,reskr,rimmx,rkxwp,rxmsq,simpl,skach,skewr,\nskrxp,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg]\n\nIn the file, there is one instance (board position) per line.\n\n\nNum Instances: 3196\nNum Attributes: 37\nNum Continuous: 0 (Int 0 / Real 0)\nNum Discrete: 37\nMissing values: 0 / 0.0%'],
# 'dataset_id': [3],
# 'version': [1],
# 'creators': ['Alen Shapiro'],
# 'contributors': ['Rob Holte'],
# 'collection_date': ['1989-08-01'],
# 'upload_date': ['2014-04-06T23:19:28'],
# 'license': ['CC0'],
# 'url': ['https://api.openml.org/data/v1/download/3/kr-vs-kp.arff'],
# 'default_target_attribute': ['class'],
# 'row_id_attribute': [None],
# 'tags': [['Machine Learning',
# 'Mathematics',
# 'mythbusting_1',
# 'OpenML-CC18',
# 'OpenML100',
# 'study_1',
# 'study_123',
# 'study_14',
# 'study_144',
# 'uci']],
# 'original_data_url': ['https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn)'],
# 'paper_url': ['https://dl.acm.org/doi/abs/10.5555/32231'],
# 'md5_checksum': ['ad6eb32b7492524d4382a40e23cdbb8e'],
# 'features': [{0: ["0 - bkblk (nominal)"],
# 1: ["1 - bknwy (nominal)"],
# 2: ["2 - bkon8 (nominal)"],
# 3: ["3 - bkona (nominal)"],
# 36: ["36 - class (nominal)"]}],
# 'num_binary_features': [35.0],
# 'num_classes': [2.0],
# 'num_features': [37.0],
# 'num_instances': [3196.0],
# 'num_instances_missing_vals': [0.0],
# 'num_missing_vals': [0.0],
# 'num_numeric_features': [0.0],
# 'num_symbolic_features': [37.0],
# 'format': ['ARFF']}
        for items in zip_longest(*[dataset_dict[key] for key in dataset_dict], fillvalue=None):
            lookup_dict = dict(zip(dataset_dict.keys(), items))
            dataset = OpenMLDataset(
                integrator=self.integrator,
                **lookup_dict,
            )
            if not dataset.exists():
                dataset.create()
            else:
                dataset.update()
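Here zip_longest transposes the column-oriented dataset_dict into one tuple per dataset, padding any shorter column with None so a missing entry cannot shift values between datasets. A standalone illustration with toy data (not part of the diff):

    from itertools import zip_longest

    columns = {"name": ["kr-vs-kp", "iris"], "format": ["ARFF"]}
    for row in zip_longest(*[columns[key] for key in columns], fillvalue=None):
        print(dict(zip(columns.keys(), row)))
    # {'name': 'kr-vs-kp', 'format': 'ARFF'}
    # {'name': 'iris', 'format': None}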
1 change: 1 addition & 0 deletions mardi_importer/mardi_importer/openml/__init__.py
@@ -0,0 +1 @@
from .OpenMLSource import OpenMLSource
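Given this export, a full import run presumably looks like the following sketch (the pickling step bridges pull() and push(), which reads from the hard-coded /data/dataset_dict.p path; the entry-point wiring itself is an assumption, as it lives elsewhere in mardi_importer):

    import pickle
    from mardi_importer.openml import OpenMLSource

    source = OpenMLSource()
    source.setup()                # import Wikidata entities, create local ones
    dataset_dict = source.pull()  # harvest metadata for all OpenML datasets
    with open("/data/dataset_dict.p", "wb") as handle:
        pickle.dump(dataset_dict, handle)
    source.push()                 # write each dataset to the local Wikibase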
21 changes: 21 additions & 0 deletions mardi_importer/mardi_importer/openml/misc.py
@@ -0,0 +1,21 @@
from datetime import datetime

def convert_time_to_iso(time_string):
    """Parse a timestamp in one of the formats used by OpenML and return it
    as an ISO 8601 string with a trailing "Z", or None if no format matches.
    """
    known_formats = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d")
    for fmt in known_formats:
        try:
            # Attempt to parse the input time string
            dt = datetime.strptime(time_string, fmt)
            break
        except ValueError:
            continue
    else:
        # No known format matched; handle other formats as needed
        print(time_string)
        return None

    # Convert to ISO format
    return dt.isoformat() + "Z"
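A quick sanity check of the accepted formats (illustrative; the first two timestamps are taken from the example data above):

    print(convert_time_to_iso("2014-04-06T23:19:28"))  # 2014-04-06T23:19:28Z
    print(convert_time_to_iso("1989-08-01"))           # 1989-08-01T00:00:00Z
    print(convert_time_to_iso("08/01/1989"))           # prints the input, returns None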
98 changes: 98 additions & 0 deletions mardi_importer/mardi_importer/openml/new_entities.json
@@ -0,0 +1,98 @@
{
"properties": [
{
"label": "dataset version",
"description": "Version of a dataset",
"datatype": "string"
},
{
"label": "collection date",
"description": "date as a string",
"datatype": "string"
},
{
"label": "upload date",
"description": "upload date of file",
"datatype": "time"
},
{
"label": "default target attribute",
"description": "the default target attribute",
"datatype": "string"
},
{
"label": "row id attribute",
"description": "the row id attribute",
"datatype": "string"
},
{
"label": "OpenML semantic tag",
"description": "the OpenML semantic tag",
"datatype": "string"
},
{
"label": "has feature",
"description": "it has feature",
"datatype": "string"
},
{
"label": "data type",
"description": "the data type",
"datatype": "string"
},
{
"label": "number of binary features",
"description": "the number of binary features",
"datatype": "quantity"
},
{
"label": "number of classes",
"description": "the number of classes",
"datatype": "quantity"
},
{
"label": "number of features",
"description": "the number of features",
"datatype": "quantity"
},
{
"label": "number of instances",
"description": "the number of instances",
"datatype": "quantity"
},
{
"label": "number of instances with missing values",
"description": "the number of instances with missing values",
"datatype": "quantity"
},
{
"label": "number of missing values",
"description": "the number of missing values",
"datatype": "quantity"
},
{
"label": "number of numeric features",
"description": "the number of numeric features",
"datatype": "quantity"
},
{
"label": "number of symbolic features",
"description": "the number of symbolic features",
"datatype": "quantity"
},
{
"label": "citation text",
"description": "free-form text about citation",
"datatype": "string"
}
],
"items": [
{
"label": "Sparse ARFF",
"description": "File format",
"claims": {
"wdt:P31": "wd:Q235557"
}
}
]
}
14 changes: 14 additions & 0 deletions mardi_importer/mardi_importer/openml/wikidata_entities.txt
@@ -0,0 +1,14 @@
Q1172284
P2701
P4092
Q185235
Q4489412
P356
P818
Q28130012
Q27017232
Q6938433
Q13442814
P459
P11238