Merge branch 'main' into ci

sentier-dev · Oct 10, 2024 · f67eae9 · f67eae9
2 parents 330fbd5 + 29b468a
commit f67eae9
Show file tree

Hide file tree

Showing 10 changed files with 36,660 additions and 63 deletions.
diff --git a/sentier_vocab/add_terms.py b/sentier_vocab/add_terms.py
@@ -1,8 +1,10 @@
-from .ordered_serialization import OrderedTurtleSerializer
 from pathlib import Path
+
+import skosify
 from rdflib import Graph, Literal, Namespace, URIRef
 from rdflib.namespace import DCTERMS, RDF, RDFS, SKOS
-import skosify
+
+from .ordered_serialization import OrderedTurtleSerializer
 
 VAEM = Namespace("http://www.linkedmodel.org/schema/vaem")
 QUDTS = Namespace("http://qudt.org/schema/qudt/")
@@ -11,25 +13,25 @@
 
 
 COMMON_PREDICATES = {
-    'broader': SKOS.broader,
-    'narrower': SKOS.narrower,
-    'prefLabel': SKOS.prefLabel,
-    'altLabel': SKOS.altLabel,
-    'hiddenLabel': SKOS.hiddenLabel,
-    'notation': SKOS.notation,
-    'definition': SKOS.definition,
-    'related': SKOS.related,
-    'exactMatch': SKOS.exactMatch,
-    'closeMatch': SKOS.closeMatch,
-    'inScheme': SKOS.inScheme,
-    'isDefinedBy': RDFS.isDefinedBy,
-    'isReplacedBy': DCTERMS.isReplacedBy,
-    'type': RDF.type,
-    'hasQuantityKind': QUDTS.hasQuantityKind,
-    'hasDimensionVector': QUDTS.hasDimensionVector,
-    'conversionMultiplier': QUDTS.conversionMultiplier,
-    'conversionMultiplier': QUDTS.conversionMultiplier,
-    'conversionMultiplierSN': QUDTS.conversionMultiplierSN,
+    "broader": SKOS.broader,
+    "narrower": SKOS.narrower,
+    "prefLabel": SKOS.prefLabel,
+    "altLabel": SKOS.altLabel,
+    "hiddenLabel": SKOS.hiddenLabel,
+    "notation": SKOS.notation,
+    "definition": SKOS.definition,
+    "related": SKOS.related,
+    "exactMatch": SKOS.exactMatch,
+    "closeMatch": SKOS.closeMatch,
+    "inScheme": SKOS.inScheme,
+    "isDefinedBy": RDFS.isDefinedBy,
+    "isReplacedBy": DCTERMS.isReplacedBy,
+    "type": RDF.type,
+    "hasQuantityKind": QUDTS.hasQuantityKind,
+    "hasDimensionVector": QUDTS.hasDimensionVector,
+    "conversionMultiplier": QUDTS.conversionMultiplier,
+    "conversionMultiplier": QUDTS.conversionMultiplier,
+    "conversionMultiplierSN": QUDTS.conversionMultiplierSN,
 }
 OBJECT_TYPES_FOR_PREDICATES = {
     SKOS.broader: Literal,
@@ -53,8 +55,8 @@
     QUDTS.conversionMultiplierSN: URIRef,
 }
 COMMON_OBJECTS = {
-    'Concept': SKOS.Concept,
-    'ConceptScheme': SKOS.ConceptScheme,
+    "Concept": SKOS.Concept,
+    "ConceptScheme": SKOS.ConceptScheme,
 }
 
 
@@ -124,7 +126,9 @@ def add_custom_terms(data: list[dict], namespace: str, filename: str) -> Path:
             raise ValueError(f"Object {o} can be translated into correct form")
 
         if object_type is not None and not isinstance(object_, object_type):
-            raise ValueError(f"Object {object_} has incorrect type for this function; should be {type(object_type)} but got {type(object_)}")
+            raise ValueError(
+                f"Object {object_} has incorrect type for this function; should be {type(object_type)} but got {type(object_)}"
+            )
 
         graph.add((subject, predicate, object_))
 
@@ -134,7 +138,7 @@ def add_custom_terms(data: list[dict], namespace: str, filename: str) -> Path:
 
     output_path = (Path(__file__).parent / "output" / filename).with_suffix(".ttl")
     serializer = OrderedTurtleSerializer(graph)
-    with open(output_path, 'wb') as fp:
+    with open(output_path, "wb") as fp:
         serializer.serialize(fp)
 
     return output_path
diff --git a/sentier_vocab/envo.py b/sentier_vocab/envo.py
@@ -112,4 +112,4 @@ def write_graph(
 
 
 if __name__ == "__main__":
-    ENVO().write_graph(dirpath = Path(__file__).parent / "output")
+    ENVO().write_graph(dirpath=Path(__file__).parent / "output")
diff --git a/sentier_vocab/geonames_iri_terms.py b/sentier_vocab/geonames_iri_terms.py
@@ -0,0 +1,110 @@
+import os
+import zipfile
+from urllib.request import urlretrieve
+
+import polars as pl
+import sentier_data_tools as sdt
+from rdflib import Graph, Literal, Namespace, URIRef
+from rdflib.namespace import RDF, SKOS, XSD
+from skosify import infer
+
+"""
+The input data comes from the GeoNames download site, but it is much too large to commit to git.
+The data used to build the dataframe for the entire world is available here:
+384MB, unpacks to 1.6 GB https://download.geonames.org/export/dump/allCountries.zip
+The data used to build the hierarchy dataframe is available here:
+2MB, unpacks to 9MB https://download.geonames.org/export/dump/hierarchy.zip
+
+Set world_path to where you stored allCountries.txt, and hierarchy_path to wherever hierarchy.txt is.
+"""
+
+
+def generateGeonameVocabulary(world_path: str, hierarchy_path: str):
+    # Build an rdflib graph of SKOS concepts from two local GeoNames dump files and serialize it; returns nothing.
+    # Disabled optional step: download both GeoNames archives into ./temp and unzip them there.
+    # temp_dir = os.path.join(os.curdir,"temp")
+    # if not os.path.exists(temp_dir):
+    #     os.mkdir(temp_dir)
+
+    # hier_zip = os.path.realpath(os.path.join(temp_dir,"hierarchy.zip"))
+    # hierarchy_path = os.path.realpath(os.path.join(temp_dir,"hierarchy.txt"))
+    # hierarchy_url = "https://download.geonames.org/export/dump/hierarchy.zip"
+
+    # world_zip = os.path.realpath(os.path.join(temp_dir,"allCountries.zip"))
+    # world_path = os.path.realpath(os.path.join(temp_dir,"allCountries.txt"))
+    # world_url = "https://download.geonames.org/export/dump/allCountries.zip"
+
+    # urlretrieve(hierarchy_url,hier_zip)
+
+    # urlretrieve(world_url,world_zip)
+
+    # with zipfile.ZipFile(hier_zip, 'r') as zip_ref:
+    #     zip_ref.extractall(temp_dir)
+
+    # with zipfile.ZipFile(world_zip, 'r') as zip_reff:
+    #     zip_reff.extractall(temp_dir)
+
+    # End of the disabled download/extract step; the code below only reads world_path and hierarchy_path.
+
+    GEOSPACES = "https://sws.geonames.org/"
+    GN = Namespace("http://www.geonames.org/ontology#")
+    # Column names and dtypes for the world file; it is read with has_header=False, so they are supplied here.
+    all_schema = pl.Schema(
+        {
+            "geonameid": pl.Int64,
+            "name": pl.String,
+            "asciiname": pl.String,
+            "alternatenames": pl.String,
+            "latitude": pl.Float32,
+            "longitude": pl.Float32,
+            "feature_class": pl.String,
+            "feature_code": pl.String,
+            "country_code": pl.String,
+            "cc2": pl.String,
+            "admin1_code": pl.String,
+            "admin2_code": pl.String,
+            "admin3_code": pl.String,
+            "admin4_code": pl.String,
+            "population": pl.Int64,
+            "elevation": pl.Int16,
+            "dem": pl.Int64,
+            "timezone": pl.String,
+            "modification_date": pl.Date,
+        }
+    )
+    # Tab-separated scan of world_path; rows are only materialised by the .collect() call further down.
+    world_frame = pl.scan_csv(
+        source=world_path, has_header=False, separator="\t", schema=all_schema
+    )
+
+    # The SQL filter further down decides which GeoNames features are modelled; widen or narrow it there.
+    # See "feature classes" in https://download.geonames.org/export/dump/readme.txt for the available codes.
+    # To keep only countries, use "where feature_code = 'PCLI'".
+
+    hierarchy_schema = pl.Schema({"parent": pl.Int64, "child": pl.Int64, "admin1_code": pl.String})
+    # NOTE(review): unlike the world scan, this call omits has_header=False; confirm the first row of hierarchy.txt is not consumed as a header.
+    hierarchy = pl.scan_csv(hierarchy_path, schema=hierarchy_schema, separator="\t")
+
+    filtered_world = world_frame.sql(
+        "select * from self where feature_code in ('PCLI', 'ADM1', 'RGN')"
+    ).collect()
+
+    world = Graph()
+    # One SKOS concept per filtered row; by schema order item[0] is geonameid, item[1] is name, item[8] is country_code.
+    for item in filtered_world.iter_rows():
+        uri = URIRef(GEOSPACES + str(item[0]))
+        pref_name = Literal(item[1])
+        alt_names = []
+        # Disabled: if item[3]: alt_names = item[3].split(",")
+        # alt_names therefore stays empty and is never added to the graph.
+        world.add((uri, RDF.type, SKOS.Concept))
+        world.add((uri, SKOS.prefLabel, pref_name))
+        world.add((uri, GN.countryCode, Literal(item[8])))
+        children = hierarchy.sql(f"select * from self where parent = {item[0]}").collect()  # NOTE(review): collected once per row; presumably re-reads hierarchy_path each time, confirm
+        if len(children) > 0:
+            for child in children.iter_rows():  # by schema order child[1] is the "child" geonameid
+                if not filtered_world.filter(pl.col("geonameid") == child[1]).is_empty():  # link only children that are also in filtered_world
+                    world.add((uri, SKOS.narrower, URIRef(GEOSPACES + str(child[1]))))
+    # skosify's skos_hierarchical runs on the finished graph; presumably it adds the inverse broader links (TODO confirm).
+    infer.skos_hierarchical(world)
+    world.serialize(destination="output/geonames-iri.ttl")  # relative path, so it resolves against the current working directory
diff --git a/sentier_vocab/graph_base.py b/sentier_vocab/graph_base.py
@@ -55,9 +55,7 @@ def skosify_checks(self):
     def get_identifier(self, uri: URIRef) -> str:
         return uri.split("/")[-1]
 
-    def write_graph(
-        self, filename: str, dirpath: Path | None = None
-    ) -> Path:
+    def write_graph(self, filename: str, dirpath: Path | None = None) -> Path:
         if not filename.endswith(".ttl"):
             filename += ".ttl"
         if not dirpath:

diff --git a/sentier_vocab/input/custom_products.py b/sentier_vocab/input/custom_products.py
@@ -1,5 +1,5 @@
-from rdflib.namespace import RDFS, SKOS, RDF
-from rdflib import URIRef, Namespace, Literal
+from rdflib import Literal, Namespace, URIRef
+from rdflib.namespace import RDF, RDFS, SKOS
 
 PRODUCTS = Namespace("https://vocab.sentier.dev/products/")
 
@@ -8,35 +8,69 @@
     ("electrolyzer", "broader", "http://data.europa.eu/xsp/cn2024/854330700080"),
     ("electrolyzer", "prefLabel", "Electrolyzer", "en-US"),
     ("electrolyzer", "prefLabel", "Electrolyzer", "en-GB"),
-    ("electrolyzer", "definition", "An electrolyzer is a machine that uses electricity to drive a chemical reaction.", "en"),
+    (
+        "electrolyzer",
+        "definition",
+        "An electrolyzer is a machine that uses electricity to drive a chemical reaction.",
+        "en",
+    ),
     ("electrolyzer", "related", "https://en.wikipedia.org/wiki/Electrolysis"),
     ("aec-electrolyzer", "type", "Concept"),
     ("aec-electrolyzer", "broader", PRODUCTS + "electrolyzer"),
     ("aec-electrolyzer", "prefLabel", "Alkaline Electrolysis Cell Electrolyzer", "en"),
-    ("aec-electrolyzer", "definition", "An electrolyser with two electrodes operating in a liquid alkaline electrolyte.", "en"),
+    (
+        "aec-electrolyzer",
+        "definition",
+        "An electrolyser with two electrodes operating in a liquid alkaline electrolyte.",
+        "en",
+    ),
     ("aec-electrolyzer", "related", "https://en.wikipedia.org/wiki/Alkaline_water_electrolysis"),
     ("pem-electrolyzer", "type", "Concept"),
     ("pem-electrolyzer", "broader", PRODUCTS + "electrolyzer"),
     ("pem-electrolyzer", "prefLabel", "Proton Exchange Membrane Electrolyser", "en-GB"),
     ("pem-electrolyzer", "prefLabel", "Proton Exchange Membrane Electrolyzer", "en-US"),
-    ("pem-electrolyzer", "definition", "An electrolyser with a solid polymer electrolyte and a proton exchange membrane.", "en"),
-    ("pem-electrolyzer", "related", "https://en.wikipedia.org/wiki/Proton_exchange_membrane_electrolysis"),
+    (
+        "pem-electrolyzer",
+        "definition",
+        "An electrolyser with a solid polymer electrolyte and a proton exchange membrane.",
+        "en",
+    ),
+    (
+        "pem-electrolyzer",
+        "related",
+        "https://en.wikipedia.org/wiki/Proton_exchange_membrane_electrolysis",
+    ),
     ("soel-electrolyzer", "type", "Concept"),
     ("soel-electrolyzer", "broader", PRODUCTS + "electrolyzer"),
     ("soel-electrolyzer", "prefLabel", "Solid Oxide Electrolyzer", "en"),
-    ("soel-electrolyzer", "definition", "A solid oxide fuel cell that runs in regenerative mode to achieve the electrolysis of water.", "en"),
+    (
+        "soel-electrolyzer",
+        "definition",
+        "A solid oxide fuel cell that runs in regenerative mode to achieve the electrolysis of water.",
+        "en",
+    ),
     ("soel-electrolyzer", "related", "https://en.wikipedia.org/wiki/Solid_oxide_electrolyzer_cell"),
     # Missing from Combined Nomenclature
     # tetraflouroethylene, not poly-
     ("tetrafluoroethylene", "type", "Concept"),
     ("tetrafluoroethylene", "broader", "http://data.europa.eu/xsp/cn2024/290349000080"),
     ("tetrafluoroethylene", "prefLabel", "Tetrafluoroethylene", "en"),
     ("tetrafluoroethylene", "related", "https://en.wikipedia.org/wiki/Tetrafluoroethylene"),
-    ("tetrafluoroethylene", "definition", "Tetrafluoroethylene (TFE) is a fluorocarbon with the chemical formula C2F4. It is the simplest perfluorinated alkene. This gaseous species is used primarily in the industrial preparation of fluoropolymers (from Wikipedia)", "en"),
+    (
+        "tetrafluoroethylene",
+        "definition",
+        "Tetrafluoroethylene (TFE) is a fluorocarbon with the chemical formula C2F4. It is the simplest perfluorinated alkene. This gaseous species is used primarily in the industrial preparation of fluoropolymers (from Wikipedia)",
+        "en",
+    ),
     # Zeolite
     ("zeolite", "type", "Concept"),
     ("zeolite", "broader", "http://data.europa.eu/xsp/cn2024/382400000080"),
     ("zeolite", "prefLabel", "Zeolite", "en"),
     ("zeolite", "related", "https://en.wikipedia.org/wiki/Zeolite"),
-    ("zeolite", "definition", "Zeolite is a family of several microporous, crystalline aluminosilicate materials commonly used as commercial adsorbents and catalysts (from Wikipedia)", "en"),
+    (
+        "zeolite",
+        "definition",
+        "Zeolite is a family of several microporous, crystalline aluminosilicate materials commonly used as commercial adsorbents and catalysts (from Wikipedia)",
+        "en",
+    ),
 ]
diff --git a/sentier_vocab/open_energy_ontology.py b/sentier_vocab/open_energy_ontology.py
@@ -1,12 +1,13 @@
-from rdflib import Graph, Namespace, URIRef
-from sentier_vocab.utils import get_file_in_downloadable_zip_archive
-from sentier_vocab.graph_base import GraphBase
-from rdflib.namespace import RDFS, SKOS, RDF
 import skosify
 from loguru import logger
+from rdflib import Graph, Namespace, URIRef
+from rdflib.namespace import RDF, RDFS, SKOS
+
+from sentier_vocab.graph_base import GraphBase
+from sentier_vocab.utils import get_file_in_downloadable_zip_archive
 
 OEO = Namespace("http://openenergy-platform.org/ontology/oeo/")
-OBO = Namespace('http://purl.obolibrary.org/obo/')
+OBO = Namespace("http://purl.obolibrary.org/obo/")
 
 MATCHES = {
     # Hydrogen
@@ -34,7 +35,6 @@
     URIRef(OEO + "OEO_00010382"): URIRef(OEO + "OEO_00010381"),
     # Fossil steam reforming hydrogen with CCS
     URIRef(OEO + "OEO_00010383"): URIRef(OEO + "OEO_00010381"),
-
     ### Electricity
     # renewable electrical energy
     URIRef(OEO + "OEO_00010384"): URIRef("http://data.europa.eu/xsp/cn2024/271600000080"),
@@ -57,8 +57,7 @@ class OpenEnergyProducts(GraphBase):
     def __init__(self, graph: Graph | None = None):
         logger.info("Parsing and creating Open Energy Ontology elements")
         data = get_file_in_downloadable_zip_archive(
-            "https://openenergyplatform.org/ontology/oeo/releases/latest",
-            "oeo-full.owl"
+            "https://openenergyplatform.org/ontology/oeo/releases/latest", "oeo-full.owl"
         )
         self.input_graph = Graph().parse(data, format="xml")
         new_graph = graph is None
Original file line number	Diff line number	Diff line change
Expand Up		@@ -112,4 +112,4 @@ def write_graph(


 if __name__ == "__main__":
-    ENVO().write_graph(dirpath = Path(__file__).parent / "output")
+    ENVO().write_graph(dirpath=Path(__file__).parent / "output")