-
Notifications
You must be signed in to change notification settings - Fork 0
/
simple_convertor.py
130 lines (112 loc) · 3.71 KB
/
simple_convertor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import sqlite3
from rdflib import Graph, Namespace, Literal, RDF, RDFS
from mlcroissant import Dataset, Records, Field
from datetime import datetime
import json
# Connect to SQLite database
dbname = 'dsc.db'
conn = sqlite3.connect(dbname)
cursor = conn.cursor()
# Replace with your actual table name
table_name = 'var'
cursor.execute(f"PRAGMA table_info('{table_name}')")
columns = cursor.fetchall()
# Create RDF graph
g = Graph()
DB = Namespace("http://cbs.nl/db#")
g.bind("db", DB)
# Add triples for table metadata
table_uri = DB[table_name]
g.add((table_uri, RDF.type, DB.Table))
g.add((table_uri, RDFS.label, Literal(table_name)))
for col_id, name, col_type, not_null, default, primary_key in columns:
column_uri = DB[name]
g.add((column_uri, RDF.type, DB.Column))
g.add((column_uri, RDFS.label, Literal(name)))
g.add((column_uri, DB.columnType, Literal(col_type)))
g.add((column_uri, DB.notNull, Literal(bool(not_null))))
g.add((column_uri, DB.defaultValue, Literal(default if default else "None")))
g.add((column_uri, DB.primaryKey, Literal(bool(primary_key))))
g.add((table_uri, DB.hasColumn, column_uri))
# Fetch table data
cursor.execute(f"SELECT * FROM {table_name}")
rows = cursor.fetchall()
# Add table data to the graph
for row_index, row in enumerate(rows):
row_uri = DB[f"{table_name}_row_{row_index + 1}"]
g.add((row_uri, RDF.type, DB.Row))
g.add((table_uri, DB.hasRow, row_uri))
for col_index, value in enumerate(row):
col_name = columns[col_index][1] # Get column name
column_uri = DB[col_name]
g.add((row_uri, column_uri, Literal(value)))
# Serialize the graph as RDF triples
output_file = "output_triples.ttl"
g.serialize(output_file, format="turtle")
print(f"RDF triples saved to {output_file}")
# Add SPARQL query to inspect noteDefinition
query = """
SELECT ?subject ?predicate ?object
WHERE {
?subject db:noteDefinition ?object .
}
LIMIT 10
"""
print("\nQuerying noteDefinition:")
for row in g.query(query):
print(f"Subject: {row.subject}")
print(f"Object: {row.object}")
print("---")
# Close database connection
conn.close()
def map_sql_to_data_type(sql_type):
"""Map SQL types to Croissant data types"""
sql_type = sql_type.upper()
type_mapping = {
'INTEGER': 'integer',
'TEXT': 'string',
'REAL': 'float',
'BLOB': 'string',
'VARCHAR': 'string',
'BOOLEAN': 'boolean'
}
return type_mapping.get(sql_type, 'string')
# Create Croissant dataset using mlcroissant
dataset = Dataset({
"@context": {
"@vocab": "http://schema.org/",
"sc": "http://schema.org/",
"cr": "http://mlcommons.org/croissant/"
},
"@type": ["https://schema.org/Dataset", "sc:Dataset"],
"https://schema.org/name": table_name,
"datePublished": datetime.now().strftime("%Y-%m-%d"),
"https://schema.org/version": "1.0.0",
"license": "https://creativecommons.org/licenses/by/4.0/",
"citation": {
"@type": "CreativeWork",
"name": f"Citation for {table_name} dataset",
"url": f"http://cbs.nl/db#{table_name}"
}
})
# Create fields list
fields = []
for col_id, name, col_type, not_null, default, primary_key in columns:
field = Field(
name=name,
description=f"Column {name} from {table_name}",
data_types=map_sql_to_data_type(col_type)
)
fields.append(field)
# Create Records with
records = Records(
dataset=dataset,
record_set=fields,
filters=[],
debug=False
)
# Save the Croissant metadata
output_file = f"{table_name}_croissant.json"
with open(output_file, 'w') as f:
json.dump(dataset.jsonld, f, indent=2) # Use the jsonld property of the Dataset
print(f"Croissant metadata saved to {output_file}")