
Commit c386fe3

Updated example
lllangWV committed Oct 22, 2024
1 parent d93fff1 commit c386fe3
Showing 1 changed file with 32 additions and 14 deletions.
46 changes: 32 additions & 14 deletions examples/scripts/Example 1 - 3D Alexandria Database.py
@@ -48,9 +48,7 @@ def normalize_dataset(db, **kwargs):
# Here we create a ParquetDB object to interact with the database
db=ParquetDB(dataset_name='alexandria_3D',dir=base_dir)

print(f"Dataset dir: {db.dataset_dir}")


# print(f"Dataset dir: {db.dataset_dir}")

benchmark_dict={
'create_times':[],
@@ -63,7 +61,7 @@ def normalize_dataset(db, **kwargs):
if len(os.listdir(db.dataset_dir))==0:
print("The dataset does not exist. Creating it.")
json_files=glob(os.path.join(alexandria_dir,'*.json'))
- for json_file in json_files:
+ for json_file in json_files[:2]:

start_time = time.time()
data = read_json(json_file)
@@ -112,6 +110,15 @@ def normalize_dataset(db, **kwargs):
print('-'*200)


+ # Here we read all the records from the database, but only read the 'id' column
+ start_time = time.time()
+ table=read_dataset(db,
+                    columns=['id'],
+                    load_format='table')
+ end_time = time.time() - start_time
+ benchmark_dict['read_single_column_time']=end_time
+ print(table.shape)
+ print('-'*200)


# Here we read records from the database by their ids
@@ -120,9 +127,11 @@ def normalize_dataset(db, **kwargs):
ids=[0,10,100,1000,10000,100000,1000000], # Controls which rows we want to read
load_format='table' # Controls the output format. The options are 'table', 'batches', 'dataset'.
)

read_time = time.time() - start_time
benchmark_dict['read_ids_time']=read_time
df=table.to_pandas() # Converts the table to a pandas dataframe
+ print(df['id'])
print(df.head())
print(df.shape)
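# The 'batches' option above streams results instead of building one table. A rough,
# hypothetical sketch of that usage (assuming it yields pyarrow RecordBatch objects;
# not part of this commit):
#   for batch in db.read(ids=[0, 10, 100], load_format='batches'):
#       print(batch.num_rows)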

@@ -131,15 +140,6 @@ def normalize_dataset(db, **kwargs):
print('-'*200)


- # Here we read all the records from the database, but only read the 'id' column
- start_time = time.time()
- table=read_dataset(db,
-                    columns=['id'],
-                    load_format='table')
- end_time = time.time() - start_time
- benchmark_dict['read_single_column_time']=end_time
- print(table.shape)
- print('-'*200)

# With only a subset of columns loaded, we can use built-in pyarrow functions to calculate statistics for our column
start_time = time.time()
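# The statistics code itself is collapsed in this diff. A rough, hypothetical sketch
# of the idea with pyarrow.compute (not the file's actual code):
#   import pyarrow.compute as pc
#   id_min_max = pc.min_max(table['id'])  # struct scalar with 'min' and 'max' fields
#   print(id_min_max['min'], id_min_max['max'])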
@@ -250,5 +250,23 @@ def normalize_dataset(db, **kwargs):
json.dump(benchmark_dict, f, indent=4)


+ table=db.read(ids=[0])
+ df=table.to_pandas()
+ print(df['data.spg'])
+
+ db.update([{'id':0, 'data.spg':204}],
+           normalize_kwargs={
+               "load_format":'batches', # Uses the batch generator to normalize
+               "load_kwargs":{'batch_readahead': 10, # Controls the number of batches to load in memory ahead of time; this affects how much RAM is consumed
+                              'fragment_readahead': 2, # Controls the number of files to load in memory ahead of time; this affects how much RAM is consumed
+                              },
+               "batch_size":100000}
+           )
+
+ table=db.read(ids=[0])
+ df=table.to_pandas()
+ print(df['data.spg'])



