
Commit c386fe3

Updated example
lllangWV committed Oct 22, 2024
1 parent d93fff1 commit c386fe3
Showing 1 changed file with 32 additions and 14 deletions.
46 changes: 32 additions & 14 deletions examples/scripts/Example 1 - 3D Alexandria Database.py
@@ -48,9 +48,7 @@ def normalize_dataset(db, **kwargs):
# Here we create a ParquetDB object to interact with the database
db=ParquetDB(dataset_name='alexandria_3D',dir=base_dir)

print(f"Dataset dir: {db.dataset_dir}")


# print(f"Dataset dir: {db.dataset_dir}")

benchmark_dict={
'create_times':[],
@@ -63,7 +61,7 @@ def normalize_dataset(db, **kwargs):
if len(os.listdir(db.dataset_dir))==0:
print("The dataset does not exist. Creating it.")
json_files=glob(os.path.join(alexandria_dir,'*.json'))
- for json_file in json_files:
+ for json_file in json_files[:2]:

start_time = time.time()
data = read_json(json_file)
@@ -112,6 +110,15 @@ def normalize_dataset(db, **kwargs):
print('-'*200)


+ # Here we read all the records from the database, but only read the 'id' column
+ start_time = time.time()
+ table=read_dataset(db,
+                    columns=['id'],
+                    load_format='table')
+ end_time = time.time() - start_time
+ benchmark_dict['read_single_column_time']=end_time
+ print(table.shape)
+ print('-'*200)


# Here we read records from the database by their ids
@@ -120,9 +127,11 @@ def normalize_dataset(db, **kwargs):
ids=[0,10,100,1000,10000,100000,1000000], # Controls which rows we want to read
load_format='table' # Controls the output format. The options are 'table', 'batches', 'dataset'.
)

read_time = time.time() - start_time
benchmark_dict['read_ids_time']=read_time
df=table.to_pandas() # Converts the table to a pandas dataframe
+ print(df['id'])
print(df.head())
print(df.shape)
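# The 'batches' option above streams results instead of building one table. A rough,
# hypothetical sketch of that usage (assuming it yields pyarrow RecordBatch objects;
# not part of this commit):
#   for batch in db.read(ids=[0, 10, 100], load_format='batches'):
#       print(batch.num_rows)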

@@ -131,15 +140,6 @@ def normalize_dataset(db, **kwargs):
print('-'*200)


- # Here we read all the records from the database, but only read the 'id' column
- start_time = time.time()
- table=read_dataset(db,
-                    columns=['id'],
-                    load_format='table')
- end_time = time.time() - start_time
- benchmark_dict['read_single_column_time']=end_time
- print(table.shape)
- print('-'*200)

# With only a subset of columns loaded, we can use built-in pyarrow functions to calculate statistics for our column
start_time = time.time()
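# The statistics code itself is collapsed in this diff. A rough, hypothetical sketch
# of the idea with pyarrow.compute (not the file's actual code):
#   import pyarrow.compute as pc
#   id_min_max = pc.min_max(table['id'])  # struct scalar with 'min' and 'max' fields
#   print(id_min_max['min'], id_min_max['max'])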
@@ -250,5 +250,23 @@ def normalize_dataset(db, **kwargs):
json.dump(benchmark_dict, f, indent=4)


+ table=db.read(ids=[0])
+ df=table.to_pandas()
+ print(df['data.spg'])
+
+ db.update([{'id':0, 'data.spg':204}],
+           normalize_kwargs={
+               "load_format":'batches', # Uses the batch generator to normalize
+               "load_kwargs":{'batch_readahead': 10, # Controls the number of batches to load in memory ahead of time; this affects how much RAM is consumed
+                              'fragment_readahead': 2, # Controls the number of files to load in memory ahead of time; this affects how much RAM is consumed
+                              },
+               "batch_size":100000}
+           )
+
+ table=db.read(ids=[0])
+ df=table.to_pandas()
+ print(df['data.spg'])



