From 43f1194078c2f7d99c587a9920311704ada1639e Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 24 Jul 2024 18:41:52 +0000 Subject: [PATCH] Deployed 04ae7ec to stable with MkDocs 1.6.0 and mike 2.1.2 --- stable/404.html | 111 +- stable/API/basedataset.html | 447 +- stable/API/datasets/alchemy.html | 111 +- stable/API/datasets/ani.html | 111 +- stable/API/datasets/comp6.html | 111 +- stable/API/datasets/des.html | 111 +- stable/API/datasets/gdml.html | 111 +- stable/API/datasets/geom.html | 111 +- stable/API/datasets/iso_17.html | 111 +- stable/API/datasets/l7.html | 111 +- stable/API/datasets/md22.html | 111 +- stable/API/datasets/metcalf.html | 111 +- stable/API/datasets/molecule3d.html | 111 +- stable/API/datasets/multixcqm9.html | 111 +- stable/API/datasets/nabladft.html | 111 +- stable/API/datasets/orbnet_denali.html | 111 +- stable/API/datasets/pcqm.html | 111 +- stable/API/datasets/proteinfragments.html | 111 +- stable/API/datasets/qm1b.html | 111 +- stable/API/datasets/qm7x.html | 111 +- stable/API/datasets/qmugs.html | 111 +- stable/API/datasets/qmx.html | 111 +- stable/API/datasets/revmd17.html | 111 +- stable/API/datasets/sn2_rxn.html | 111 +- stable/API/datasets/solvated_peptides.html | 111 +- stable/API/datasets/spice.html | 111 +- stable/API/datasets/splinter.html | 111 +- stable/API/datasets/tmqm.html | 111 +- stable/API/datasets/transition1x.html | 111 +- stable/API/datasets/vqm24.html | 111 +- stable/API/datasets/waterclusters.html | 111 +- stable/API/datasets/waterclusters3_30.html | 111 +- stable/API/datasets/x40.html | 111 +- stable/API/e0_dispatcher.html | 3955 ++++++++++++++++ stable/API/formats.html | 113 +- stable/API/methods.html | 111 +- stable/API/properties.html | 2546 ++++++++++ stable/API/regressor.html | 111 +- stable/API/statistics.html | 4930 ++++++++++++++++++++ stable/API/units.html | 111 +- stable/API/utils.html | 111 +- stable/cli.html | 111 +- stable/contribute.html | 111 +- stable/data_storage.html | 111 +- 
stable/dataset_upload.html | 111 +- stable/datasets.html | 111 +- stable/index.html | 119 +- stable/licensing.html | 111 +- stable/normalization_e0s.html | 111 +- stable/objects.inv | Bin 2671 -> 3647 bytes stable/search/search_index.json | 2 +- stable/sitemap.xml | 109 +- stable/sitemap.xml.gz | Bin 553 -> 581 bytes stable/tutorials/usage.html | 111 +- stable/usage.html | 116 +- 55 files changed, 16990 insertions(+), 231 deletions(-) create mode 100644 stable/API/e0_dispatcher.html create mode 100644 stable/API/properties.html create mode 100644 stable/API/statistics.html diff --git a/stable/404.html b/stable/404.html index 5ed97fe..57281b7 100644 --- a/stable/404.html +++ b/stable/404.html @@ -694,12 +694,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/basedataset.html b/stable/API/basedataset.html index 0df59e1..97e0d17 100644 --- a/stable/API/basedataset.html +++ b/stable/API/basedataset.html @@ -16,7 +16,7 @@ - + @@ -24,7 +24,7 @@ - Main class - OpenQDC + BaseDataset - OpenQDC @@ -109,7 +109,7 @@
    - Main class + BaseDataset
    @@ -714,6 +714,46 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • @@ -1983,6 +2103,15 @@ +
  • + +
  • + + + e0s_dispatcher + + +
  • @@ -2179,7 +2308,7 @@ -

    Main class

    +

    BaseDataset

    @@ -2214,7 +2343,7 @@

    - Bases: DatasetPropertyMixIn

    + Bases: DatasetPropertyMixIn

    Base class for datasets in the openQDC package.

    @@ -2900,7 +3029,13 @@

    753 754 755 -756

    class BaseDataset(DatasetPropertyMixIn):
    +756
    +757
    +758
    +759
    +760
    +761
    +762
    class BaseDataset(DatasetPropertyMixIn):
         """
         Base class for datasets in the openQDC package.
         """
    @@ -3063,7 +3198,13 @@ 

    return list(compress(self.energy_methods, self.force_mask)) @property - def e0s_dispatcher(self): + def e0s_dispatcher(self) -> AtomEnergies: + """ + Property to get the object that dispatched the isolated atom energies of the QM methods. + + Returns: + Object wrapping the isolated atom energies of the QM methods. + """ if not hasattr(self, "_e0s_dispatcher"): # Automatically fetch/compute formation or regression energies self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs) @@ -3618,6 +3759,50 @@

    + e0s_dispatcher: AtomEnergies + + + property + + +

    + + +
    + +

    Property to get the object that dispatched the isolated atom energies of the QM methods.

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + AtomEnergies + +
    +

    Object wrapping the isolated atom energies of the QM methods.

    +
    +
    +
    + +
    + +
    + + +

    energy_methods: List[str] @@ -3936,11 +4121,11 @@

    Source code in openqdc/datasets/base.py -
    713
    -714
    -715
    -716
    -717
    def __smiles_converter__(self, x):
    +              
    719
    +720
    +721
    +722
    +723
    def __smiles_converter__(self, x):
         """util function to convert string to smiles: useful if the smiles is
         encoded in a different format than its display format
         """
    @@ -4032,13 +4217,7 @@ 

    Source code in openqdc/datasets/base.py -
    645
    -646
    -647
    -648
    -649
    -650
    -651
    +              
    651
     652
     653
     654
    @@ -4049,7 +4228,13 @@ 

    659 660 661 -662

    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:
    +662
    +663
    +664
    +665
    +666
    +667
    +668
    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:
         """
         Return the dataset as an iterator.
     
    @@ -4202,13 +4387,7 @@ 

    Source code in openqdc/datasets/base.py -
    594
    -595
    -596
    -597
    -598
    -599
    -600
    +              
    600
     601
     602
     603
    @@ -4251,7 +4430,13 @@ 

    640 641 642 -643

    @requires_package("datamol")
    +643
    +644
    +645
    +646
    +647
    +648
    +649
    @requires_package("datamol")
     def calculate_descriptors(
         self,
         descriptor_name: str = "soap",
    @@ -4374,13 +4559,7 @@ 

    Source code in openqdc/datasets/base.py -
    383
    -384
    -385
    -386
    -387
    -388
    -389
    +              
    389
     390
     391
     392
    @@ -4393,7 +4572,13 @@ 

    399 400 401 -402

    def collate_list(self, list_entries: List[Dict]) -> Dict:
    +402
    +403
    +404
    +405
    +406
    +407
    +408
    def collate_list(self, list_entries: List[Dict]) -> Dict:
         """
         Collate a list of entries into a single dictionary.
     
    @@ -4514,13 +4699,7 @@ 

    Source code in openqdc/datasets/base.py -
    560
    -561
    -562
    -563
    -564
    -565
    -566
    +              
    566
     567
     568
     569
    @@ -4531,7 +4710,13 @@ 

    574 575 576 -577

    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:
    +577
    +578
    +579
    +580
    +581
    +582
    +583
    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:
         """
         Get the ASE atoms object for the entry at index idx.
     
    @@ -4622,13 +4807,7 @@ 

    Source code in openqdc/datasets/base.py -
    668
    -669
    -670
    -671
    -672
    -673
    -674
    +              
    674
     675
     676
     677
    @@ -4656,7 +4835,13 @@ 

    699 700 701 -702

    def get_statistics(self, return_none: bool = True) -> Dict:
    +702
    +703
    +704
    +705
    +706
    +707
    +708
    def get_statistics(self, return_none: bool = True) -> Dict:
         """
         Get the converted statistics of the dataset.
     
    @@ -4735,19 +4920,19 @@ 

    Source code in openqdc/datasets/base.py -
    474
    -475
    -476
    -477
    -478
    -479
    -480
    +              
    480
     481
     482
     483
     484
     485
    -486
    def is_cached(self) -> bool:
    +486
    +487
    +488
    +489
    +490
    +491
    +492
    def is_cached(self) -> bool:
         """
         Check if the dataset is cached locally.
     
    @@ -4804,19 +4989,19 @@ 

    Source code in openqdc/datasets/base.py -
    460
    -461
    -462
    -463
    -464
    -465
    -466
    +              
    466
     467
     468
     469
     470
     471
    -472
    def is_preprocessed(self) -> bool:
    +472
    +473
    +474
    +475
    +476
    +477
    +478
    def is_preprocessed(self) -> bool:
         """
         Check if the dataset is preprocessed and available online or locally.
     
    @@ -4947,13 +5132,7 @@ 

    Source code in openqdc/datasets/base.py -
    488
    -489
    -490
    -491
    -492
    -493
    -494
    +              
    494
     495
     496
     497
    @@ -4963,7 +5142,13 @@ 

    501 502 503 -504

    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):
    +504
    +505
    +506
    +507
    +508
    +509
    +510
    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):
         """
         Preprocess the dataset and save it.
     
    @@ -5001,11 +5186,11 @@ 

    Source code in openqdc/datasets/base.py -
    377
    -378
    -379
    -380
    -381
    def read_raw_entries(self):
    +              
    383
    +384
    +385
    +386
    +387
    def read_raw_entries(self):
         """
         Preprocess the raw (aka from the fetched source) into a list of dictionaries.
         """
    @@ -5089,13 +5274,7 @@ 

    Source code in openqdc/datasets/base.py -
    404
    -405
    -406
    -407
    -408
    -409
    -410
    +              
    410
     411
     412
     413
    @@ -5111,7 +5290,13 @@ 

    423 424 425 -426

    def save_preprocess(
    +426
    +427
    +428
    +429
    +430
    +431
    +432
    def save_preprocess(
         self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False
     ):
         """
    @@ -5226,13 +5411,7 @@ 

    Source code in openqdc/datasets/base.py -
    523
    -524
    -525
    -526
    -527
    -528
    -529
    +              
    529
     530
     531
     532
    @@ -5243,7 +5422,13 @@ 

    537 538 539 -540

    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):
    +540
    +541
    +542
    +543
    +544
    +545
    +546
    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):
         """
         Save a single entry at index idx as an extxyz file.
     
    @@ -5311,18 +5496,18 @@ 

    Source code in openqdc/datasets/base.py -
    360
    -361
    -362
    -363
    -364
    -365
    -366
    +              
    366
     367
     368
     369
     370
    -371
    def set_distance_unit(self, value: str):
    +371
    +372
    +373
    +374
    +375
    +376
    +377
    def set_distance_unit(self, value: str):
         """
         Set a new distance unit for the dataset.
     
    @@ -5384,18 +5569,18 @@ 

    Source code in openqdc/datasets/base.py -
    347
    -348
    -349
    -350
    -351
    -352
    -353
    +              
    353
     354
     355
     356
     357
    -358
    def set_energy_unit(self, value: str):
    +358
    +359
    +360
    +361
    +362
    +363
    +364
    def set_energy_unit(self, value: str):
         """
         Set a new energy unit for the dataset.
     
    @@ -5471,13 +5656,7 @@ 

    Source code in openqdc/datasets/base.py -
    542
    -543
    -544
    -545
    -546
    -547
    -548
    +              
    548
     549
     550
     551
    @@ -5487,7 +5666,13 @@ 

    555 556 557 -558

    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):
    +558
    +559
    +560
    +561
    +562
    +563
    +564
    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):
         """
         Save dataset as single xyz file (extended xyz format).
     
    @@ -5569,13 +5754,7 @@ 

    Source code in openqdc/datasets/base.py -
    506
    -507
    -508
    -509
    -510
    -511
    -512
    +              
    512
     513
     514
     515
    @@ -5584,7 +5763,13 @@ 

    518 519 520 -521

    def upload(self, overwrite: bool = False, as_zarr: bool = False):
    +521
    +522
    +523
    +524
    +525
    +526
    +527
    def upload(self, overwrite: bool = False, as_zarr: bool = False):
         """
         Upload the preprocessed data to the remote storage. Must be called after preprocess and
         need to have write privileges.
    diff --git a/stable/API/datasets/alchemy.html b/stable/API/datasets/alchemy.html
    index 0eeab5c..1c288d7 100644
    --- a/stable/API/datasets/alchemy.html
    +++ b/stable/API/datasets/alchemy.html
    @@ -712,12 +712,50 @@
       
       
       
    +    
    +    
    +    
    +      
    +      
    +    
    +    
    +    
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/ani.html b/stable/API/datasets/ani.html index 6d8acaa..8687ed9 100644 --- a/stable/API/datasets/ani.html +++ b/stable/API/datasets/ani.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/comp6.html b/stable/API/datasets/comp6.html index 29b654c..d6e44fd 100644 --- a/stable/API/datasets/comp6.html +++ b/stable/API/datasets/comp6.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/des.html b/stable/API/datasets/des.html index e26c89e..56b6223 100644 --- a/stable/API/datasets/des.html +++ b/stable/API/datasets/des.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/gdml.html b/stable/API/datasets/gdml.html index c60391a..b72b9c3 100644 --- a/stable/API/datasets/gdml.html +++ b/stable/API/datasets/gdml.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/geom.html b/stable/API/datasets/geom.html index 149ebf0..df6d8d7 100644 --- a/stable/API/datasets/geom.html +++ b/stable/API/datasets/geom.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/iso_17.html b/stable/API/datasets/iso_17.html index efcb509..eb44cd4 100644 --- a/stable/API/datasets/iso_17.html +++ b/stable/API/datasets/iso_17.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/l7.html b/stable/API/datasets/l7.html index 64c5313..19d4cb6 100644 --- a/stable/API/datasets/l7.html +++ b/stable/API/datasets/l7.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/md22.html b/stable/API/datasets/md22.html index 1f4ba22..6ce5eab 100644 --- a/stable/API/datasets/md22.html +++ b/stable/API/datasets/md22.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/metcalf.html b/stable/API/datasets/metcalf.html index 417176e..f10a98d 100644 --- a/stable/API/datasets/metcalf.html +++ b/stable/API/datasets/metcalf.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/molecule3d.html b/stable/API/datasets/molecule3d.html index 05c4954..ad03706 100644 --- a/stable/API/datasets/molecule3d.html +++ b/stable/API/datasets/molecule3d.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/multixcqm9.html b/stable/API/datasets/multixcqm9.html index adbccab..51c872a 100644 --- a/stable/API/datasets/multixcqm9.html +++ b/stable/API/datasets/multixcqm9.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/nabladft.html b/stable/API/datasets/nabladft.html index ed9bda2..ee3af62 100644 --- a/stable/API/datasets/nabladft.html +++ b/stable/API/datasets/nabladft.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/orbnet_denali.html b/stable/API/datasets/orbnet_denali.html index 6b49bb1..bdc4573 100644 --- a/stable/API/datasets/orbnet_denali.html +++ b/stable/API/datasets/orbnet_denali.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/pcqm.html b/stable/API/datasets/pcqm.html index 84dfdb7..f75e9a5 100644 --- a/stable/API/datasets/pcqm.html +++ b/stable/API/datasets/pcqm.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/proteinfragments.html b/stable/API/datasets/proteinfragments.html index 6071a61..1ccead1 100644 --- a/stable/API/datasets/proteinfragments.html +++ b/stable/API/datasets/proteinfragments.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/qm1b.html b/stable/API/datasets/qm1b.html index dd14218..0cd5626 100644 --- a/stable/API/datasets/qm1b.html +++ b/stable/API/datasets/qm1b.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/qm7x.html b/stable/API/datasets/qm7x.html index 9b963fc..f11df61 100644 --- a/stable/API/datasets/qm7x.html +++ b/stable/API/datasets/qm7x.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/qmugs.html b/stable/API/datasets/qmugs.html index 7d0caf2..a611019 100644 --- a/stable/API/datasets/qmugs.html +++ b/stable/API/datasets/qmugs.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/qmx.html b/stable/API/datasets/qmx.html index 9606b6d..bca379f 100644 --- a/stable/API/datasets/qmx.html +++ b/stable/API/datasets/qmx.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/revmd17.html b/stable/API/datasets/revmd17.html index 6f784ba..0b11751 100644 --- a/stable/API/datasets/revmd17.html +++ b/stable/API/datasets/revmd17.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/sn2_rxn.html b/stable/API/datasets/sn2_rxn.html index 181b444..5812f63 100644 --- a/stable/API/datasets/sn2_rxn.html +++ b/stable/API/datasets/sn2_rxn.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/solvated_peptides.html b/stable/API/datasets/solvated_peptides.html index c1385c4..7c75c4e 100644 --- a/stable/API/datasets/solvated_peptides.html +++ b/stable/API/datasets/solvated_peptides.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/spice.html b/stable/API/datasets/spice.html index 0567af0..e1e389e 100644 --- a/stable/API/datasets/spice.html +++ b/stable/API/datasets/spice.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/splinter.html b/stable/API/datasets/splinter.html index 9fb1b6d..fa6ae7e 100644 --- a/stable/API/datasets/splinter.html +++ b/stable/API/datasets/splinter.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/tmqm.html b/stable/API/datasets/tmqm.html index e16196f..8352705 100644 --- a/stable/API/datasets/tmqm.html +++ b/stable/API/datasets/tmqm.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/transition1x.html b/stable/API/datasets/transition1x.html index 799b6d3..b5de0a6 100644 --- a/stable/API/datasets/transition1x.html +++ b/stable/API/datasets/transition1x.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/vqm24.html b/stable/API/datasets/vqm24.html index 6d33c78..39c9f70 100644 --- a/stable/API/datasets/vqm24.html +++ b/stable/API/datasets/vqm24.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/waterclusters.html b/stable/API/datasets/waterclusters.html index f518110..3c8218a 100644 --- a/stable/API/datasets/waterclusters.html +++ b/stable/API/datasets/waterclusters.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/waterclusters3_30.html b/stable/API/datasets/waterclusters3_30.html index 3a4bb0a..b434c5d 100644 --- a/stable/API/datasets/waterclusters3_30.html +++ b/stable/API/datasets/waterclusters3_30.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/datasets/x40.html b/stable/API/datasets/x40.html index 103ff57..7606522 100644 --- a/stable/API/datasets/x40.html +++ b/stable/API/datasets/x40.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/e0_dispatcher.html b/stable/API/e0_dispatcher.html new file mode 100644 index 0000000..626a154 --- /dev/null +++ b/stable/API/e0_dispatcher.html @@ -0,0 +1,3955 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + e0 Dispatcher - OpenQDC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + + + + +
    +
    + + + +
    +
    +
    + + + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    + + + + + + + +

    e0 Dispatcher

    + +
    + + + + +
    + + + +
    + + + + + + + + +
    + + + +

    + AtomEnergies + + +

    + + +
    + + +

    Manager class for interface with the isolated atom energies classes +and providing the generals function to retrieve the data

    + +
    + Source code in openqdc/datasets/energies.py +
     98
    + 99
    +100
    +101
    +102
    +103
    +104
    +105
    +106
    +107
    +108
    +109
    +110
    +111
    +112
    +113
    +114
    +115
    +116
    +117
    +118
    +119
    +120
    +121
    +122
    +123
    +124
    +125
    +126
    +127
    +128
    +129
    +130
    +131
    +132
    +133
    +134
    +135
    +136
    +137
    +138
    +139
    +140
    +141
    +142
    +143
    +144
    +145
    +146
    +147
    +148
    +149
    +150
    +151
    +152
    +153
    +154
    +155
    +156
    +157
    +158
    +159
    +160
    +161
    +162
    +163
    class AtomEnergies:
    +    """
    +    Manager class for interface with the isolated atom energies classes
    +    and providing the generals function to retrieve the data
    +    """
    +
    +    def __init__(self, data, **kwargs) -> None:
    +        self.atom_energies = data.energy_type
    +        self.factory = dispatch_factory(data, **kwargs)
    +
    +    @property
    +    def e0s_matrix(self) -> np.ndarray:
    +        """
    +        Return the isolated atom energies dictionary
    +
    +        Returns:
    +            Matrix Array with the isolated atom energies
    +        """
    +        return self.factory.e0_matrix
    +
    +    @property
    +    def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:
    +        """
    +        Return the isolated atom energies dictionary
    +
    +        Returns:
    +            Dictionary with the isolated atom energies
    +        """
    +        return self.factory.e0_dict
    +
    +    def __str__(self):
    +        return f"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}"
    +
    +    def __repr__(self):
    +        return str(self)
    +
    +    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:
    +        """
    +        Retrieve a key from the isolated atom dictionary.
    +        Item can be written as tuple(Symbol, charge),
    +        tuple(Chemical number, charge). If no charge is passed,
    +        it will be automatically set to 0.
    +
    +        Examples:
    +            AtomEnergies[6], AtomEnergies[6,1], \n
    +            AtomEnergies["C",1], AtomEnergies[(6,1)], \n
    +            AtomEnergies[("C,1)]
    +
    +        Parameters:
    +            item:
    +                AtomSpecies object or tuple with the atom symbol and charge
    +
    +        Returns:
    +            AtomEnergy object with the isolated atom energy
    +        """
    +        try:
    +            atom, charge = item[0], item[1]
    +        except TypeError:
    +            atom = item
    +            charge = 0
    +        except IndexError:
    +            atom = item[0]
    +            charge = 0
    +        if not isinstance(atom, str):
    +            atom = ATOM_SYMBOLS[atom]
    +        return self.e0s_dict[(atom, charge)]
    +
    +
    + + + +
    + + + + + + + +
    + + + +

    + e0s_dict: Dict[AtomSpecies, AtomEnergy] + + + property + + +

    + + +
    + +

    Return the isolated atom energies dictionary

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Dict[AtomSpecies, AtomEnergy] + +
    +

    Dictionary with the isolated atom energies

    +
    +
    +
    + +
    + +
    + + + +

    + e0s_matrix: np.ndarray + + + property + + +

    + + +
    + +

    Return the isolated atom energies dictionary

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + ndarray + +
    +

    Matrix Array with the isolated atom energies

    +
    +
    +
    + +
    + + + +
    + + +

    + __getitem__(item) + +

    + + +
    + +

    Retrieve a key from the isolated atom dictionary. +Item can be written as tuple(Symbol, charge), +tuple(Chemical number, charge). If no charge is passed, +it will be automatically set to 0.

    + + +

    Examples:

    +

    AtomEnergies[6], AtomEnergies[6,1],

    +

    AtomEnergies["C",1], AtomEnergies[(6,1)],

    +

    AtomEnergies[("C,1)]

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    item + AtomSpecies + +
    +

    AtomSpecies object or tuple with the atom symbol and charge

    +
    +
    + required +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + AtomEnergy + +
    +

    AtomEnergy object with the isolated atom energy

    +
    +
    + +
    + Source code in openqdc/datasets/energies.py +
    134
    +135
    +136
    +137
    +138
    +139
    +140
    +141
    +142
    +143
    +144
    +145
    +146
    +147
    +148
    +149
    +150
    +151
    +152
    +153
    +154
    +155
    +156
    +157
    +158
    +159
    +160
    +161
    +162
    +163
    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:
    +    """
    +    Retrieve a key from the isolated atom dictionary.
    +    Item can be written as tuple(Symbol, charge),
    +    tuple(Chemical number, charge). If no charge is passed,
    +    it will be automatically set to 0.
    +
    +    Examples:
    +        AtomEnergies[6], AtomEnergies[6,1], \n
    +        AtomEnergies["C",1], AtomEnergies[(6,1)], \n
    +        AtomEnergies[("C,1)]
    +
    +    Parameters:
    +        item:
    +            AtomSpecies object or tuple with the atom symbol and charge
    +
    +    Returns:
    +        AtomEnergy object with the isolated atom energy
    +    """
    +    try:
    +        atom, charge = item[0], item[1]
    +    except TypeError:
    +        atom = item
    +        charge = 0
    +    except IndexError:
    +        atom = item[0]
    +        charge = 0
    +    if not isinstance(atom, str):
    +        atom = ATOM_SYMBOLS[atom]
    +    return self.e0s_dict[(atom, charge)]
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + +
    + + + +

    + AtomEnergy + + + + dataclass + + +

    + + +
    + + +

    Datastructure to store isolated atom energies +and the std deviation associated to the value. +By default the std will be 1 if no value was calculated +or not available (formation energy case)

    + +
    + Source code in openqdc/datasets/energies.py +
    74
    +75
    +76
    +77
    +78
    +79
    +80
    +81
    +82
    +83
    +84
    +85
    +86
    +87
    +88
    +89
    +90
    +91
    +92
    +93
    +94
    +95
    @dataclass
    +class AtomEnergy:
    +    """
    +    Datastructure to store isolated atom energies
    +    and the std deviation associated to the value.
    +    By default the std will be 1 if no value was calculated
    +    or not available (formation energy case)
    +    """
    +
    +    mean: np.array
    +    std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32))
    +
    +    def __post_init__(self):
    +        if not isinstance(self.mean, np.ndarray):
    +            self.mean = np.array([self.mean], dtype=np.float32)
    +
    +    def append(self, other: "AtomEnergy"):
    +        """
    +        Append the mean and std of another atom energy
    +        """
    +        self.mean = np.append(self.mean, other.mean)
    +        self.std = np.append(self.std, other.std)
    +
    +
    + + + +
    + + + + + + + + + +
    + + +

    + append(other) + +

    + + +
    + +

    Append the mean and std of another atom energy

    + +
    + Source code in openqdc/datasets/energies.py +
    90
    +91
    +92
    +93
    +94
    +95
    def append(self, other: "AtomEnergy"):
    +    """
    +    Append the mean and std of another atom energy
    +    """
    +    self.mean = np.append(self.mean, other.mean)
    +    self.std = np.append(self.std, other.std)
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + +
    + + + +

    + AtomSpecies + + + + dataclass + + +

    + + +
    + + +

    Structure that defines a tuple of chemical specie and charge +and provide hash and automatic conversion from atom number to +checmical symbol

    + +
    + Source code in openqdc/datasets/energies.py +
    48
    +49
    +50
    +51
    +52
    +53
    +54
    +55
    +56
    +57
    +58
    +59
    +60
    +61
    +62
    +63
    +64
    +65
    +66
    +67
    +68
    +69
    +70
    +71
    @dataclass(frozen=False, eq=True)
    +class AtomSpecies:
    +    """
    +    Structure that defines a tuple of chemical specie and charge
    +    and provide hash and automatic conversion from atom number to
    +    checmical symbol
    +    """
    +
    +    symbol: Union[str, int]
    +    charge: int = 0
    +
    +    def __post_init__(self):
    +        if not isinstance(self.symbol, str):
    +            self.symbol = ATOM_SYMBOLS[self.symbol]
    +        self.number = ATOMIC_NUMBERS[self.symbol]
    +
    +    def __hash__(self):
    +        return hash((self.symbol, self.charge))
    +
    +    def __eq__(self, other):
    +        if not isinstance(other, AtomSpecies):
    +            symbol, charge = other[0], other[1]
    +            other = AtomSpecies(symbol=symbol, charge=charge)
    +        return (self.number, self.charge) == (other.number, other.charge)
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + IsolatedEnergyInterface + + +

    + + +
    +

    + Bases: ABC

    + + +

    Abstract class that defines the interface for the +different implementation of an isolated atom energy value

    + +
    + Source code in openqdc/datasets/energies.py +
    166
    +167
    +168
    +169
    +170
    +171
    +172
    +173
    +174
    +175
    +176
    +177
    +178
    +179
    +180
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    +188
    +189
    +190
    +191
    +192
    +193
    +194
    +195
    +196
    +197
    +198
    +199
    +200
    +201
    +202
    +203
    +204
    +205
    +206
    +207
    +208
    +209
    +210
    +211
    +212
    +213
    +214
    +215
    +216
    +217
    +218
    +219
    +220
    +221
    +222
    +223
    +224
    +225
    +226
    +227
    class IsolatedEnergyInterface(ABC):
    +    """
    +    Abstract class that defines the interface for the
    +    different implementation of an isolated atom energy value
    +    """
    +
    +    def __init__(self, data, **kwargs):
    +        """
    +        Parameters:
    +            data : openqdc.datasets.Dataset
    +                Dataset object that contains the information
    +                about the isolated atom energies. Info will be passed
    +                by references
    +            kwargs : dict
    +                Additional arguments that will be passed to the
    +                selected energy class. Mostly used for regression
    +                to pass the regressor_kwargs.
    +        """
    +        self._e0_matrixs = []
    +        self._e0_dict = None
    +        self.kwargs = kwargs
    +        self.data = data
    +        self._post_init()
    +
    +    @property
    +    def refit(self) -> bool:
    +        return self.data.refit_e0s
    +
    +    @abstractmethod
    +    def _post_init(self):
    +        """
    +        Main method to fetch/compute/recomputed the isolated atom energies.
    +        Need to be implemented in all child classes.
    +        """
    +        pass
    +
    +    def __len__(self):
    +        return len(self.data.energy_methods)
    +
    +    @property
    +    def e0_matrix(self) -> np.ndarray:
    +        """
    +        Return the isolated atom energies matrixes
    +
    +        Returns:
    +            Matrix Array with the isolated atom energies
    +        """
    +        return np.array(self._e0_matrixs)
    +
    +    @property
    +    def e0_dict(self) -> Dict:
    +        """
    +        Return the isolated atom energies dict
    +
    +        Returns:
    +            Dictionary with the isolated atom energies
    +        """
    +
    +        return self._e0s_dict
    +
    +    def __str__(self) -> str:
    +        return self.__class__.__name__.lower()
    +
    +
    + + + +
    + + + + + + + +
    + + + +

    + e0_dict: Dict + + + property + + +

    + + +
    + +

    Return the isolated atom energies dict

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Dict + +
    +

    Dictionary with the isolated atom energies

    +
    +
    +
    + +
    + +
    + + + +

    + e0_matrix: np.ndarray + + + property + + +

    + + +
    + +

    Return the isolated atom energies matrices

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + ndarray + +
    +

    Matrix Array with the isolated atom energies

    +
    +
    +
    + +
    + + + +
    + + +

    + __init__(data, **kwargs) + +

    + + +
    + + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    data + +
    +

    openqdc.datasets.Dataset +Dataset object that contains the information +about the isolated atom energies. Info will be passed +by reference

    +
    +
    + required +
    kwargs + +
    +

    dict +Additional arguments that will be passed to the +selected energy class. Mostly used for regression +to pass the regressor_kwargs.

    +
    +
    + {} +
    + +
    + Source code in openqdc/datasets/energies.py +
    172
    +173
    +174
    +175
    +176
    +177
    +178
    +179
    +180
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    +188
    def __init__(self, data, **kwargs):
    +    """
    +    Parameters:
    +        data : openqdc.datasets.Dataset
    +            Dataset object that contains the information
    +            about the isolated atom energies. Info will be passed
    +            by references
    +        kwargs : dict
    +            Additional arguments that will be passed to the
    +            selected energy class. Mostly used for regression
    +            to pass the regressor_kwargs.
    +    """
    +    self._e0_matrixs = []
    +    self._e0_dict = None
    +    self.kwargs = kwargs
    +    self.data = data
    +    self._post_init()
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + +
    + + + +

    + NullEnergy + + +

    + + +
    +

    + Bases: IsolatedEnergyInterface

    + + +

    Class that returns a null (zeros) matrix for the isolated atom energies in case +no energies are available.

    + +
    + Source code in openqdc/datasets/energies.py +
    252
    +253
    +254
    +255
    +256
    +257
    +258
    +259
    +260
    +261
    +262
    +263
    +264
    +265
    +266
    +267
    +268
    +269
    +270
    +271
    +272
    class NullEnergy(IsolatedEnergyInterface):
    +    """
    +    Class that returns a null (zeros) matrix for the isolated atom energies in case
    +    of no energies are available.
    +    """
    +
    +    def _assembly_e0_dict(self):
    +        datum = {}
    +        for _ in self.data.__energy_methods__:
    +            for key, values in PotentialMethod.NONE.atom_energies_dict.items():
    +                atm = AtomSpecies(*key)
    +                ens = AtomEnergy(values)
    +                if atm not in datum:
    +                    datum[atm] = ens
    +                else:
    +                    datum[atm].append(ens)
    +        self._e0s_dict = datum
    +
    +    def _post_init(self):
    +        self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))]
    +        self._assembly_e0_dict()
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + PhysicalEnergy + + +

    + + +
    +

    + Bases: IsolatedEnergyInterface

    + + +

    Class that returns the physical (SE, DFT, etc.) isolated atom energies.

    + +
    + Source code in openqdc/datasets/energies.py +
    230
    +231
    +232
    +233
    +234
    +235
    +236
    +237
    +238
    +239
    +240
    +241
    +242
    +243
    +244
    +245
    +246
    +247
    +248
    +249
    class PhysicalEnergy(IsolatedEnergyInterface):
    +    """
    +    Class that returns a physical (SE,DFT,etc) isolated atom energies.
    +    """
    +
    +    def _assembly_e0_dict(self):
    +        datum = {}
    +        for method in self.data.__energy_methods__:
    +            for key, values in method.atom_energies_dict.items():
    +                atm = AtomSpecies(*key)
    +                ens = AtomEnergy(values)
    +                if atm not in datum:
    +                    datum[atm] = ens
    +                else:
    +                    datum[atm].append(ens)
    +        self._e0s_dict = datum
    +
    +    def _post_init(self):
    +        self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__]
    +        self._assembly_e0_dict()
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + RegressionEnergy + + +

    + + +
    +

    + Bases: IsolatedEnergyInterface

    + + +

    Class that computes and returns the regressed isolated atom energies.

    + +
    + Source code in openqdc/datasets/energies.py +
    275
    +276
    +277
    +278
    +279
    +280
    +281
    +282
    +283
    +284
    +285
    +286
    +287
    +288
    +289
    +290
    +291
    +292
    +293
    +294
    +295
    +296
    +297
    +298
    +299
    +300
    +301
    +302
    +303
    +304
    +305
    +306
    +307
    +308
    +309
    +310
    +311
    +312
    +313
    +314
    +315
    +316
    +317
    +318
    +319
    +320
    +321
    +322
    +323
    +324
    +325
    +326
    +327
    +328
    +329
    +330
    +331
    +332
    +333
    +334
    +335
    +336
    +337
    +338
    +339
    +340
    +341
    +342
    +343
    +344
    +345
    +346
    +347
    +348
    +349
    +350
    +351
    +352
    +353
    +354
    +355
    class RegressionEnergy(IsolatedEnergyInterface):
    +    """
    +    Class that compute and returns the regressed isolated atom energies.
    +    """
    +
    +    def _post_init(self):
    +        if not self.attempt_load() or self.refit:
    +            self.regressor = Regressor.from_openqdc_dataset(self.data, **self.kwargs)
    +            E0s, cov = self._compute_regression_e0s()
    +            self._set_lin_atom_species_dict(E0s, cov)
    +        self._set_linear_e0s()
    +
    +    def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    +        """
    +        Try to compute the regressed isolated atom energies.
    +        raise an error if the regression fails.
    +        return the regressed isolated atom energies and the uncertainty values.
    +
    +        Returns:
    +            Tuple with the regressed isolated atom energies and the uncertainty values of the regression
    +            if available.
    +        """
    +        try:
    +            E0s, cov = self.regressor.solve()
    +        except np.linalg.LinAlgError:
    +            logger.warning(f"Failed to compute E0s using {self.regressor.solver_type} regression.")
    +            raise np.linalg.LinAlgError
    +        return E0s, cov
    +
    +    def _set_lin_atom_species_dict(self, E0s, covs) -> None:
    +        """
    +        Set the regressed isolated atom energies in a dictionary format
    +        and Save the values in a pickle file to easy loading.
    +        """
    +        atomic_energies_dict = {}
    +        for i, z in enumerate(self.regressor.numbers):
    +            for charge in range(-10, 11):
    +                atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i])
    +            # atomic_energies_dict[z] = E0s[i]
    +        self._e0s_dict = atomic_energies_dict
    +        self.save_e0s()
    +
    +    def _set_linear_e0s(self) -> None:
    +        """
    +        Transform the e0s dictionary into the correct e0s
    +        matrix format.
    +        """
    +        new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]
    +        for z, e0 in self._e0s_dict.items():
    +            for i in range(len(self)):
    +                # new_e0s[i][z, :] = e0[i]
    +                new_e0s[i][z.number, z.charge] = e0.mean[i]
    +            # for atom_sp, values in
    +        self._e0_matrixs = new_e0s
    +
    +    def save_e0s(self) -> None:
    +        """
    +        Save the regressed isolated atom energies in a pickle file.
    +        """
    +        save_pkl(self._e0s_dict, self.preprocess_path)
    +
    +    def attempt_load(self) -> bool:
    +        """
    +        Try to load the regressed isolated atom energies from the
    +        object pickle file and return the success of the operation.
    +        """
    +        try:
    +            self._e0s_dict = load_pkl(self.preprocess_path)
    +            logger.info(f"Found energy file for {str(self)}.")
    +            return True
    +        except FileNotFoundError:
    +            logger.warning(f"Energy file for {str(self)} not found.")
    +            return False
    +
    +    @property
    +    def preprocess_path(self):
    +        """
    +        Return the path to the object pickle file.
    +        """
    +        path = p_join(self.data.root, "preprocessed", str(self) + ".pkl")
    +        return path
    +
    +
    + + + +
    + + + + + + + +
    + + + +

    + preprocess_path + + + property + + +

    + + +
    + +

    Return the path to the object pickle file.

    +
    + +
    + + + +
    + + +

    + attempt_load() + +

    + + +
    + +

    Try to load the regressed isolated atom energies from the +object pickle file and return the success of the operation.

    + +
    + Source code in openqdc/datasets/energies.py +
    336
    +337
    +338
    +339
    +340
    +341
    +342
    +343
    +344
    +345
    +346
    +347
    def attempt_load(self) -> bool:
    +    """
    +    Try to load the regressed isolated atom energies from the
    +    object pickle file and return the success of the operation.
    +    """
    +    try:
    +        self._e0s_dict = load_pkl(self.preprocess_path)
    +        logger.info(f"Found energy file for {str(self)}.")
    +        return True
    +    except FileNotFoundError:
    +        logger.warning(f"Energy file for {str(self)} not found.")
    +        return False
    +
    +
    +
    + +
    + +
    + + +

    + save_e0s() + +

    + + +
    + +

    Save the regressed isolated atom energies in a pickle file.

    + +
    + Source code in openqdc/datasets/energies.py +
    330
    +331
    +332
    +333
    +334
    def save_e0s(self) -> None:
    +    """
    +    Save the regressed isolated atom energies in a pickle file.
    +    """
    +    save_pkl(self._e0s_dict, self.preprocess_path)
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + + +
    + + +

    + dispatch_factory(data, **kwargs) + +

    + + +
    + +

    Factory function that selects the correct +energy class for the fetching/calculation +of isolated atom energies.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    data + +
    +

    openqdc.datasets.Dataset +Dataset object that contains the information +about the isolated atom energies. Info will be passed +by reference

    +
    +
    + required +
    kwargs + +
    +

    dict +Additional arguments that will be passed to the +selected energy class. Mostly used for regression +to pass the regressor_kwargs.

    +
    +
    + {} +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + IsolatedEnergyInterface + +
    +

    Initialized IsolatedEnergyInterface-like object

    +
    +
    + +
    + Source code in openqdc/datasets/energies.py +
    17
    +18
    +19
    +20
    +21
    +22
    +23
    +24
    +25
    +26
    +27
    +28
    +29
    +30
    +31
    +32
    +33
    +34
    +35
    +36
    +37
    +38
    +39
    +40
    +41
    +42
    +43
    +44
    +45
    def dispatch_factory(data: Any, **kwargs: Dict) -> "IsolatedEnergyInterface":
    +    """
    +    Factory function that select the correct
    +    energy class for the fetching/calculation
    +    of isolated atom energies.
    +
    +    Parameters:
    +        data : openqdc.datasets.Dataset
    +            Dataset object that contains the information
    +            about the isolated atom energies. Info will be passed
    +            by references
    +        kwargs : dict
    +            Additional arguments that will be passed to the
    +            selected energy class. Mostly used for regression
    +            to pass the regressor_kwargs.
    +
    +    Returns:
    +        Initialized IsolatedEnergyInterface-like object
    +    """
    +    if data.energy_type == "formation":
    +        return PhysicalEnergy(data, **kwargs)
    +    elif data.energy_type == "regression":
    +        try:
    +            return RegressionEnergy(data, **kwargs)
    +        except np.linalg.LinAlgError:
    +            logger.warning("Error! Using physical energies instead.")
    +            return PhysicalEnergy(data, **kwargs)
    +    elif data.energy_type == "null":
    +        return NullEnergy(data, **kwargs)
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/stable/API/formats.html b/stable/API/formats.html index 3047042..b4e8e6e 100644 --- a/stable/API/formats.html +++ b/stable/API/formats.html @@ -13,7 +13,7 @@ - + @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + diff --git a/stable/API/methods.html b/stable/API/methods.html index ea05143..5ea5c05 100644 --- a/stable/API/methods.html +++ b/stable/API/methods.html @@ -842,12 +842,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/properties.html b/stable/API/properties.html new file mode 100644 index 0000000..1d6eb0b --- /dev/null +++ b/stable/API/properties.html @@ -0,0 +1,2546 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Available Properties - OpenQDC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + + + + +
    +
    + + + +
    +
    +
    + + + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    + + + + + + + +

    Defined properties for datasets

    + + +
    + + + + +
    + + + +
    + + + + + + + + +
    + + + +

    + DatasetPropertyMixIn + + +

    + + +
    + + +

    Mixin class for BaseDataset class to add +properties that are common to all datasets.

    + +
    + Source code in openqdc/datasets/properties.py +
    10
    +11
    +12
    +13
    +14
    +15
    +16
    +17
    +18
    +19
    +20
    +21
    +22
    +23
    +24
    +25
    +26
    +27
    +28
    +29
    +30
    +31
    +32
    +33
    +34
    +35
    +36
    +37
    +38
    +39
    +40
    +41
    +42
    +43
    +44
    +45
    +46
    +47
    +48
    +49
    +50
    +51
    +52
    +53
    +54
    +55
    +56
    +57
    +58
    +59
    +60
    +61
    +62
    +63
    +64
    +65
    +66
    +67
    +68
    +69
    +70
    +71
    +72
    +73
    +74
    +75
    +76
    +77
    +78
    +79
    +80
    +81
    +82
    +83
    +84
    +85
    +86
    +87
    +88
    +89
    +90
    +91
    +92
    class DatasetPropertyMixIn:
    +    """
    +    Mixin class for BaseDataset class to add
    +    properties that are common to all datasets.
    +    """
    +
    +    @property
    +    def atoms_per_molecules(self):
    +        try:
    +            if hasattr(self, "_n_atoms"):
    +                return self._n_atoms
    +            self._n_atoms = self.data["n_atoms"]
    +            return self._n_atoms
    +        except:  # noqa
    +            return None
    +
    +    @property
    +    def _stats(self):
    +        return self.__stats__
    +
    +    def _compute_average_nb_atoms(self):
    +        self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
    +
    +    @property
    +    def average_n_atoms(self) -> int:
    +        """
    +        Average number of atoms in a molecule in the dataset.
    +
    +        Returns:
    +            Average number of atoms in a molecule in the dataset.
    +        """
    +        if self.__average_nb_atoms__ is None:
    +            raise StatisticsNotAvailableError(self.__name__)
    +        return self.__average_nb_atoms__
    +
    +    @property
    +    def numbers(self) -> np.ndarray:
    +        """
    +        Unique atomic numbers in the dataset
    +
    +        Returns:
    +            Array of the unique atomic numbers in the dataset
    +        """
    +        if hasattr(self, "_numbers"):
    +            return self._numbers
    +        self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
    +        return self._numbers
    +
    +    @property
    +    def charges(self) -> np.ndarray:
    +        """
    +        Unique charges in the dataset
    +
    +        Returns:
    +            Array of the unique charges in the dataset
    +        """
    +        if hasattr(self, "_charges"):
    +            return self._charges
    +        self._charges = np.unique(self.data["atomic_inputs"][..., :2], axis=0).astype(np.int32)
    +        return self._charges
    +
    +    @property
    +    def min_max_charges(self) -> Tuple[int, int]:
    +        """
    +        Minimum and maximum charges in the dataset
    +
    +        Returns:
    +            (min_charge, max_charge)
    +        """
    +        if hasattr(self, "_min_max_charges"):
    +            return self._min_max_charges
    +        self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])
    +        return self._min_max_charges
    +
    +    @property
    +    def chemical_species(self) -> np.ndarray:
    +        """
    +        Chemical symbols in the dataset
    +
    +        Returns:
    +            Array of the chemical symbols in the dataset
    +        """
    +        return np.array(ATOM_SYMBOLS)[self.numbers]
    +
    +
    + + + +
    + + + + + + + +
    + + + +

    + average_n_atoms: int + + + property + + +

    + + +
    + +

    Average number of atoms in a molecule in the dataset.

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + int + +
    +

    Average number of atoms in a molecule in the dataset.

    +
    +
    +
    + +
    + +
    + + + +

    + charges: np.ndarray + + + property + + +

    + + +
    + +

    Unique charges in the dataset

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + ndarray + +
    +

    Array of the unique charges in the dataset

    +
    +
    +
    + +
    + +
    + + + +

    + chemical_species: np.ndarray + + + property + + +

    + + +
    + +

    Chemical symbols in the dataset

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + ndarray + +
    +

    Array of the chemical symbols in the dataset

    +
    +
    +
    + +
    + +
    + + + +

    + min_max_charges: Tuple[int, int] + + + property + + +

    + + +
    + +

    Minimum and maximum charges in the dataset

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Tuple[int, int] + +
    +

    (min_charge, max_charge)

    +
    +
    +
    + +
    + +
    + + + +

    + numbers: np.ndarray + + + property + + +

    + + +
    + +

    Unique atomic numbers in the dataset

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + ndarray + +
    +

    Array of the unique atomic numbers in the dataset

    +
    +
    +
    + +
    + + + + + +
    + +
    + +
    + + + + +
    + +
    + +
    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/stable/API/regressor.html b/stable/API/regressor.html index d53ea44..48cb6f5 100644 --- a/stable/API/regressor.html +++ b/stable/API/regressor.html @@ -859,12 +859,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/statistics.html b/stable/API/statistics.html new file mode 100644 index 0000000..46996de --- /dev/null +++ b/stable/API/statistics.html @@ -0,0 +1,4930 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Statistics - OpenQDC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + + + + +
    +
    + + + +
    +
    +
    + + + + + + + +
    +
    +
    + + + + + + + +
    +
    + + + + + + + +

    Statistics

    + +
    + + + + +
    + + + +
    + + + + + + + + +
    + + + +

    + AbstractStatsCalculator + + +

    + + +
    +

    + Bases: ABC

    + + +

    Abstract class that defines the interface for all +the calculator objects and the methods to +compute the statistics.

    + +
    + Source code in openqdc/datasets/statistics.py +
    159
    +160
    +161
    +162
    +163
    +164
    +165
    +166
    +167
    +168
    +169
    +170
    +171
    +172
    +173
    +174
    +175
    +176
    +177
    +178
    +179
    +180
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    +188
    +189
    +190
    +191
    +192
    +193
    +194
    +195
    +196
    +197
    +198
    +199
    +200
    +201
    +202
    +203
    +204
    +205
    +206
    +207
    +208
    +209
    +210
    +211
    +212
    +213
    +214
    +215
    +216
    +217
    +218
    +219
    +220
    +221
    +222
    +223
    +224
    +225
    +226
    +227
    +228
    +229
    +230
    +231
    +232
    +233
    +234
    +235
    +236
    +237
    +238
    +239
    +240
    +241
    +242
    +243
    +244
    +245
    +246
    +247
    +248
    +249
    +250
    +251
    +252
    +253
    +254
    +255
    +256
    +257
    +258
    +259
    +260
    +261
    +262
    +263
    +264
    +265
    +266
    +267
    +268
    +269
    +270
    +271
    +272
    +273
    +274
    +275
    +276
    +277
    +278
    +279
    +280
    +281
    +282
    +283
    +284
    +285
    +286
    +287
    +288
    +289
    +290
    +291
    +292
    +293
    +294
    +295
    +296
    +297
    +298
    +299
    +300
    +301
    +302
    +303
    +304
    +305
    +306
    +307
    +308
    +309
    +310
    +311
    +312
    +313
    +314
    +315
    +316
    +317
    +318
    +319
    +320
    +321
    class AbstractStatsCalculator(ABC):
    +    """
    +    Abstract class that defines the interface for all
    +    the calculators object and the methods to
    +    compute the statistics.
    +    """
    +
    +    # State Dependencies of the calculator to skip part of the calculation
    +    state_dependency = []
    +    name = None
    +
    +    def __init__(
    +        self,
    +        name: str,
    +        energy_type: Optional[str] = None,
    +        force_recompute: bool = False,
    +        energies: Optional[np.ndarray] = None,
    +        n_atoms: Optional[np.ndarray] = None,
    +        atom_species: Optional[np.ndarray] = None,
    +        position_idx_range: Optional[np.ndarray] = None,
    +        e0_matrix: Optional[np.ndarray] = None,
    +        atom_charges: Optional[np.ndarray] = None,
    +        forces: Optional[np.ndarray] = None,
    +    ):
    +        """
    +        Parameters:
    +            name :
    +                Name of the dataset for saving and loading.
    +            energy_type :
    +                Type of the energy for the computation of the statistics. Used for loading and saving.
    +            force_recompute :
    +                Flag to force the recomputation of the statistics
    +            energies : n
    +                Energies of the dataset
    +            n_atoms :
    +                Number of atoms in the dataset
    +            atom_species :
    +                Atomic species of the dataset
    +            position_idx_range : n
    +                Position index range of the dataset
    +            e0_matrix :
    +                Isolated atom energies matrix of the dataset
    +            atom_charges :
    +                Atomic charges of the dataset
    +            forces :
    +                Forces of the dataset
    +        """
    +        self.name = name
    +        self.energy_type = energy_type
    +        self.force_recompute = force_recompute
    +        self.energies = energies
    +        self.forces = forces
    +        self.position_idx_range = position_idx_range
    +        self.e0_matrix = e0_matrix
    +        self.n_atoms = n_atoms
    +        self.atom_species_charges_tuple = (atom_species, atom_charges)
    +        self._root = p_join(get_local_cache(), self.name)
    +        if atom_species is not None and atom_charges is not None:
    +            # by value not reference
    +            self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)
    +
    +    @property
    +    def has_forces(self) -> bool:
    +        return self.forces is not None
    +
    +    @property
    +    def preprocess_path(self):
    +        path = p_join(self.root, "statistics", self.name + f"_{str(self)}" + ".pkl")
    +        return path
    +
    +    @property
    +    def root(self):
    +        """
    +        Path to the dataset folder
    +        """
    +        return self._root
    +
    +    @classmethod
    +    def from_openqdc_dataset(cls, dataset, recompute: bool = False):
    +        """
    +        Create a calculator object from a dataset object.
    +        """
    +        obj = cls(
    +            name=dataset.__name__,
    +            force_recompute=recompute,
    +            energy_type=dataset.energy_type,
    +            energies=dataset.data["energies"],
    +            forces=dataset.data["forces"] if "forces" in dataset.data else None,
    +            n_atoms=dataset.data["n_atoms"],
    +            position_idx_range=dataset.data["position_idx_range"],
    +            atom_species=dataset.data["atomic_inputs"][:, 0].ravel(),
    +            atom_charges=dataset.data["atomic_inputs"][:, 1].ravel(),
    +            e0_matrix=dataset.__isolated_atom_energies__,
    +        )
    +        obj._root = dataset.root  # set to the dataset root in case of multiple datasets
    +        return obj
    +
    +    @abstractmethod
    +    def compute(self) -> StatisticsResults:
    +        """
    +        Abstract method to compute the statistics.
    +        Must return a StatisticsResults object and be implemented
    +        in all the childs
    +        """
    +        raise NotImplementedError
    +
    +    def save_statistics(self) -> None:
    +        """
    +        Save statistics file to the dataset folder as a pkl file
    +        """
    +        save_pkl(self.result, self.preprocess_path)
    +
    +    def attempt_load(self) -> bool:
    +        """
    +        Load precomputed statistics file and return the success of the operation
    +        """
    +        try:
    +            self.result = load_pkl(self.preprocess_path)
    +            logger.info(f"Statistics for {str(self)} loaded successfully")
    +            return True
    +        except FileNotFoundError:
    +            logger.warning(f"Statistics for {str(self)} not found. Computing...")
    +            return False
    +
    +    def _setup_deps(self, state: Dict) -> None:
    +        """
    +        Check if the dependencies of calculators are satisfied
    +        from the state object and set the attributes of the calculator
    +        to skip part of the calculation
    +        """
    +        self.state = state
    +        self.deps_satisfied = all([dep in state for dep in self.state_dependency])
    +        if self.deps_satisfied:
    +            for dep in self.state_dependency:
    +                setattr(self, dep, state[dep])
    +
    +    def write_state(self, update: Dict) -> None:
    +        """
    +        Write/update the state dictionary with the update dictionary
    +
    +        update:
    +            dictionary containing the update to the state
    +        """
    +        self.state.update(update)
    +
    +    def run(self, state: Dict) -> None:
    +        """
    +        Main method to run the calculator.
    +        Setup the dependencies from the state dictionary
    +        Check if the statistics are already computed and load them or
    +        recompute them
    +        Save the statistics in the correct folder
    +
    +        state:
    +            dictionary containing the state of the calculator
    +        """
    +        self._setup_deps(state)
    +        if self.force_recompute or not self.attempt_load():
    +            self.result = self.compute()
    +            self.save_statistics()
    +
    +    def __str__(self) -> str:
    +        return self.__class__.__name__.lower()
    +
    +
    + + + +
    + + + + + + + +
    + + + +

    + root + + + property + + +

    + + +
    + +

    Path to the dataset folder

    +
    + +
    + + + +
    + + +

    + __init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, forces=None) + +

    + + +
    + + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    name + +
    +

    Name of the dataset for saving and loading.

    +
    +
    + required +
    energy_type + +
    +

    Type of the energy for the computation of the statistics. Used for loading and saving.

    +
    +
    + None +
    force_recompute + +
    +

    Flag to force the recomputation of the statistics

    +
    +
    + False +
    energies + +
    +

    n +Energies of the dataset

    +
    +
    + None +
    n_atoms + +
    +

    Number of atoms in the dataset

    +
    +
    + None +
    atom_species + +
    +

    Atomic species of the dataset

    +
    +
    + None +
    position_idx_range + +
    +

    n +Position index range of the dataset

    +
    +
    + None +
    e0_matrix + +
    +

    Isolated atom energies matrix of the dataset

    +
    +
    + None +
    atom_charges + +
    +

    Atomic charges of the dataset

    +
    +
    + None +
    forces + +
    +

    Forces of the dataset

    +
    +
    + None +
    + +
    + Source code in openqdc/datasets/statistics.py +
    170
    +171
    +172
    +173
    +174
    +175
    +176
    +177
    +178
    +179
    +180
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    +188
    +189
    +190
    +191
    +192
    +193
    +194
    +195
    +196
    +197
    +198
    +199
    +200
    +201
    +202
    +203
    +204
    +205
    +206
    +207
    +208
    +209
    +210
    +211
    +212
    +213
    +214
    +215
    +216
    +217
    +218
    def __init__(
    +    self,
    +    name: str,
    +    energy_type: Optional[str] = None,
    +    force_recompute: bool = False,
    +    energies: Optional[np.ndarray] = None,
    +    n_atoms: Optional[np.ndarray] = None,
    +    atom_species: Optional[np.ndarray] = None,
    +    position_idx_range: Optional[np.ndarray] = None,
    +    e0_matrix: Optional[np.ndarray] = None,
    +    atom_charges: Optional[np.ndarray] = None,
    +    forces: Optional[np.ndarray] = None,
    +):
    +    """
    +    Parameters:
    +        name :
    +            Name of the dataset for saving and loading.
    +        energy_type :
    +            Type of the energy for the computation of the statistics. Used for loading and saving.
    +        force_recompute :
    +            Flag to force the recomputation of the statistics
    +        energies : n
    +            Energies of the dataset
    +        n_atoms :
    +            Number of atoms in the dataset
    +        atom_species :
    +            Atomic species of the dataset
    +        position_idx_range : n
    +            Position index range of the dataset
    +        e0_matrix :
    +            Isolated atom energies matrix of the dataset
    +        atom_charges :
    +            Atomic charges of the dataset
    +        forces :
    +            Forces of the dataset
    +    """
    +    self.name = name
    +    self.energy_type = energy_type
    +    self.force_recompute = force_recompute
    +    self.energies = energies
    +    self.forces = forces
    +    self.position_idx_range = position_idx_range
    +    self.e0_matrix = e0_matrix
    +    self.n_atoms = n_atoms
    +    self.atom_species_charges_tuple = (atom_species, atom_charges)
    +    self._root = p_join(get_local_cache(), self.name)
    +    if atom_species is not None and atom_charges is not None:
    +        # by value not reference
    +        self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)
    +
    +
    +
    + +
    + +
    + + +

    + attempt_load() + +

    + + +
    + +

    Load precomputed statistics file and return the success of the operation

    + +
    + Source code in openqdc/datasets/statistics.py +
    271
    +272
    +273
    +274
    +275
    +276
    +277
    +278
    +279
    +280
    +281
    def attempt_load(self) -> bool:
    +    """
    +    Load precomputed statistics file and return the success of the operation
    +    """
    +    try:
    +        self.result = load_pkl(self.preprocess_path)
    +        logger.info(f"Statistics for {str(self)} loaded successfully")
    +        return True
    +    except FileNotFoundError:
    +        logger.warning(f"Statistics for {str(self)} not found. Computing...")
    +        return False
    +
    +
    +
    + +
    + +
    + + +

    + compute() + + + abstractmethod + + +

    + + +
    + +

    Abstract method to compute the statistics. +Must return a StatisticsResults object and be implemented +in all the child classes

    + +
    + Source code in openqdc/datasets/statistics.py +
    256
    +257
    +258
    +259
    +260
    +261
    +262
    +263
    @abstractmethod
    +def compute(self) -> StatisticsResults:
    +    """
    +    Abstract method to compute the statistics.
    +    Must return a StatisticsResults object and be implemented
    +    in all the child classes
    +    """
    +    raise NotImplementedError
    +
    +
    +
    + +
    + +
    + + +

    + from_openqdc_dataset(dataset, recompute=False) + + + classmethod + + +

    + + +
    + +

    Create a calculator object from a dataset object.

    + +
    + Source code in openqdc/datasets/statistics.py +
    236
    +237
    +238
    +239
    +240
    +241
    +242
    +243
    +244
    +245
    +246
    +247
    +248
    +249
    +250
    +251
    +252
    +253
    +254
    @classmethod
    +def from_openqdc_dataset(cls, dataset, recompute: bool = False):
    +    """
    +    Create a calculator object from a dataset object.
    +    """
    +    obj = cls(
    +        name=dataset.__name__,
    +        force_recompute=recompute,
    +        energy_type=dataset.energy_type,
    +        energies=dataset.data["energies"],
    +        forces=dataset.data["forces"] if "forces" in dataset.data else None,
    +        n_atoms=dataset.data["n_atoms"],
    +        position_idx_range=dataset.data["position_idx_range"],
    +        atom_species=dataset.data["atomic_inputs"][:, 0].ravel(),
    +        atom_charges=dataset.data["atomic_inputs"][:, 1].ravel(),
    +        e0_matrix=dataset.__isolated_atom_energies__,
    +    )
    +    obj._root = dataset.root  # set to the dataset root in case of multiple datasets
    +    return obj
    +
    +
    +
    + +
    + +
    + + +

    + run(state) + +

    + + +
    + +

    Main method to run the calculator. +Setup the dependencies from the state dictionary +Check if the statistics are already computed and load them or +recompute them +Save the statistics in the correct folder

    + + +
    + state +

    dictionary containing the state of the calculator

    +
    +
    + Source code in openqdc/datasets/statistics.py +
    304
    +305
    +306
    +307
    +308
    +309
    +310
    +311
    +312
    +313
    +314
    +315
    +316
    +317
    +318
    def run(self, state: Dict) -> None:
    +    """
    +    Main method to run the calculator.
    +    Setup the dependencies from the state dictionary
    +    Check if the statistics are already computed and load them or
    +    recompute them
    +    Save the statistics in the correct folder
    +
    +    state:
    +        dictionary containing the state of the calculator
    +    """
    +    self._setup_deps(state)
    +    if self.force_recompute or not self.attempt_load():
    +        self.result = self.compute()
    +        self.save_statistics()
    +
    +
    +
    + +
    + +
    + + +

    + save_statistics() + +

    + + +
    + +

    Save statistics file to the dataset folder as a pkl file

    + +
    + Source code in openqdc/datasets/statistics.py +
    265
    +266
    +267
    +268
    +269
    def save_statistics(self) -> None:
    +    """
    +    Save statistics file to the dataset folder as a pkl file
    +    """
    +    save_pkl(self.result, self.preprocess_path)
    +
    +
    +
    + +
    + +
    + + +

    + write_state(update) + +

    + + +
    + +

    Write/update the state dictionary with the update dictionary

    + + +
    + update +

    dictionary containing the update to the state

    +
    +
    + Source code in openqdc/datasets/statistics.py +
    295
    +296
    +297
    +298
    +299
    +300
    +301
    +302
    def write_state(self, update: Dict) -> None:
    +    """
    +    Write/update the state dictionary with the update dictionary
    +
    +    update:
    +        dictionary containing the update to the state
    +    """
    +    self.state.update(update)
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + +
    + + + +

    + EnergyStatistics + + + + dataclass + + +

    + + +
    +

    + Bases: StatisticsResults

    + + +

    Dataclass for energy related statistics

    + +
    + Source code in openqdc/datasets/statistics.py +
    41
    +42
    +43
    +44
    +45
    +46
    +47
    +48
    @dataclass
    +class EnergyStatistics(StatisticsResults):
    +    """
    +    Dataclass for energy related statistics
    +    """
    +
    +    mean: Optional[np.ndarray]
    +    std: Optional[np.ndarray]
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + ForceStatistics + + + + dataclass + + +

    + + +
    +

    + Bases: StatisticsResults

    + + +

    Dataclass for force statistics

    + +
    + Source code in openqdc/datasets/statistics.py +
    51
    +52
    +53
    +54
    +55
    +56
    +57
    +58
    +59
    +60
    +61
    @dataclass
    +class ForceStatistics(StatisticsResults):
    +    """
    +    Dataclass for force statistics
    +    """
    +
    +    mean: Optional[np.ndarray]
    +    std: Optional[np.ndarray]
    +    component_mean: Optional[np.ndarray]
    +    component_std: Optional[np.ndarray]
    +    component_rms: Optional[np.ndarray]
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + ForcesCalculatorStats + + +

    + + +
    +

    + Bases: AbstractStatsCalculator

    + + +

    Forces statistics calculator class

    + +
    + Source code in openqdc/datasets/statistics.py +
    324
    +325
    +326
    +327
    +328
    +329
    +330
    +331
    +332
    +333
    +334
    +335
    +336
    +337
    +338
    +339
    +340
    +341
    +342
    +343
    +344
    +345
    class ForcesCalculatorStats(AbstractStatsCalculator):
    +    """
    +    Forces statistics calculator class
    +    """
    +
    +    def compute(self) -> ForceStatistics:
    +        if not self.has_forces:
    +            return ForceStatistics(mean=None, std=None, component_mean=None, component_std=None, component_rms=None)
    +        converted_force_data = self.forces
    +        num_methods = converted_force_data.shape[2]
    +        mean = np.nanmean(converted_force_data.reshape(-1, num_methods), axis=0)
    +        std = np.nanstd(converted_force_data.reshape(-1, num_methods), axis=0)
    +        component_mean = np.nanmean(converted_force_data, axis=0)
    +        component_std = np.nanstd(converted_force_data, axis=0)
    +        component_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))
    +        return ForceStatistics(
    +            mean=np.atleast_2d(mean),
    +            std=np.atleast_2d(std),
    +            component_mean=np.atleast_2d(component_mean),
    +            component_std=np.atleast_2d(component_std),
    +            component_rms=np.atleast_2d(component_rms),
    +        )
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + FormationEnergyInterface + + +

    + + +
    +

    + Bases: AbstractStatsCalculator, ABC

    + + +

    Formation Energy interface calculator class. +Define the use of the dependency formation_energy in the +compute method

    + +
    + Source code in openqdc/datasets/statistics.py +
    360
    +361
    +362
    +363
    +364
    +365
    +366
    +367
    +368
    +369
    +370
    +371
    +372
    +373
    +374
    +375
    +376
    +377
    +378
    +379
    +380
    +381
    +382
    +383
    +384
    +385
    +386
    +387
    +388
    +389
    +390
    +391
    +392
    +393
    +394
    +395
    +396
    +397
    +398
    +399
    class FormationEnergyInterface(AbstractStatsCalculator, ABC):
    +    """
    +    Formation Energy interface calculator class.
    +    Define the use of the dependency formation_energy in the
    +    compute method
    +    """
    +
    +    state_dependency = ["formation_energy"]
    +
    +    def compute(self) -> EnergyStatistics:
    +        # if the state has not the dependency satisfied
    +        if not self.deps_satisfied:
    +            # run the main computation
    +            from openqdc.utils.constants import MAX_CHARGE
    +
    +            splits_idx = self.position_idx_range[:, 1]
    +            s = np.array(self.atom_species_charges_tuple, dtype=int)
    +            s[:, 1] += MAX_CHARGE
    +            matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.e0_matrix]
    +            converted_energy_data = self.energies
    +            E = []
    +            for i, matrix in enumerate(matrixs):
    +                c = np.cumsum(np.append([0], matrix))[splits_idx]
    +                c[1:] = c[1:] - c[:-1]
    +                E.append(converted_energy_data[:, i] - c)
    +        else:
    +            # if the dependency is satisfied get the dependency
    +            E = getattr(self, self.state_dependency[0])
    +        self.write_state({self.state_dependency[0]: E})
    +        E = np.array(E).T
    +        return self._compute(E)
    +
    +    @abstractmethod
    +    def _compute(self, energy) -> EnergyStatistics:
    +        raise NotImplementedError
    +
    +    def __str__(self) -> str:
    +        # override the __str__ method to add the energy type to the name
    +        # to differentiate between formation and regression type
    +        return f"{self.__class__.__name__.lower()}_{self.energy_type.lower()}"
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + FormationEnergyStats + + +

    + + +
    +

    + Bases: FormationEnergyInterface

    + + +

    Formation Energy calculator class.

    + +
    + Source code in openqdc/datasets/statistics.py +
    402
    +403
    +404
    +405
    +406
    +407
    +408
    +409
    +410
    class FormationEnergyStats(FormationEnergyInterface):
    +    """
    +    Formation Energy  calculator class.
    +    """
    +
    +    def _compute(self, energy) -> EnergyStatistics:
    +        formation_E_mean = np.nanmean(energy, axis=0)
    +        formation_E_std = np.nanstd(energy, axis=0)
    +        return EnergyStatistics(mean=np.atleast_2d(formation_E_mean), std=np.atleast_2d(formation_E_std))
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + PerAtomFormationEnergyStats + + +

    + + +
    +

    + Bases: FormationEnergyInterface

    + + +

    Per atom Formation Energy calculator class.

    + +
    + Source code in openqdc/datasets/statistics.py +
    413
    +414
    +415
    +416
    +417
    +418
    +419
    +420
    +421
    class PerAtomFormationEnergyStats(FormationEnergyInterface):
    +    """
    +    Per atom Formation Energy  calculator class.
    +    """
    +
    +    def _compute(self, energy) -> EnergyStatistics:
    +        inter_E_mean = np.nanmean((energy / self.n_atoms[:, None]), axis=0)
    +        inter_E_std = np.nanstd((energy / self.n_atoms[:, None]), axis=0)
    +        return EnergyStatistics(mean=np.atleast_2d(inter_E_mean), std=np.atleast_2d(inter_E_std))
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + +
    + + + +

    + StatisticManager + + +

    + + +
    + + +

    Manager class that automatically handles the shared state between +the statistic calculators

    + +
    + Source code in openqdc/datasets/statistics.py +
     64
    + 65
    + 66
    + 67
    + 68
    + 69
    + 70
    + 71
    + 72
    + 73
    + 74
    + 75
    + 76
    + 77
    + 78
    + 79
    + 80
    + 81
    + 82
    + 83
    + 84
    + 85
    + 86
    + 87
    + 88
    + 89
    + 90
    + 91
    + 92
    + 93
    + 94
    + 95
    + 96
    + 97
    + 98
    + 99
    +100
    +101
    +102
    +103
    +104
    +105
    +106
    +107
    +108
    +109
    +110
    +111
    +112
    +113
    +114
    +115
    +116
    +117
    +118
    +119
    +120
    +121
    +122
    +123
    +124
    +125
    +126
    +127
    +128
    +129
    +130
    +131
    +132
    +133
    +134
    +135
    +136
    +137
    +138
    +139
    +140
    +141
    +142
    +143
    +144
    +145
    +146
    +147
    +148
    +149
    +150
    +151
    +152
    +153
    +154
    +155
    +156
    class StatisticManager:
    +    """
    +    Manager class that automatically handles the shared state between
    +    the statistic calculators
    +    """
    +
    +    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
    +        """
    +        Parameters:
    +            dataset : openqdc.datasets.base.BaseDataset
    +                The dataset object to compute the statistics
    +            recompute:
    +                Flag to recompute the statistics
    +            *statistic_calculators:
    +                List of statistic calculators to run
    +        """
    +        self._state = {}
    +        self._results = {}
    +        self._statistic_calculators = [
    +            statistic_calculators.from_openqdc_dataset(dataset, recompute)
    +            for statistic_calculators in statistic_calculators
    +        ]
    +
    +    @property
    +    def state(self) -> Dict:
    +        """
    +        Return the dictionary state of the manager
    +
    +        Returns:
    +            State of the StatisticManager
    +        """
    +        return self._state
    +
    +    def reset_state(self):
    +        """
    +        Reset the state dictionary
    +        """
    +        self._state = {}
    +
    +    def reset_results(self):
    +        """
    +        Reset the results dictionary
    +        """
    +        self._results = {}
    +
    +    def get_state(self, key: Optional[str] = None) -> Optional[Any]:
    +        """
    +        Return the value of the key in the state dictionary
    +
    +        Parameters:
    +            key: str, default = None
    +        Returns:
    +            the value of the key in the state dictionary
    +            or the whole state dictionary if key is None
    +        """
    +        if key is None:
    +            return self._state
    +        return self._state.get(key, None)
    +
    +    def has_state(self, key: str) -> bool:
    +        """
    +        Check if the state has the key
    +
    +        Parameters:
    +            key:
    +                Key to check in the state dictionary
    +
    +        Returns:
    +            True if the key is in the state dictionary
    +        """
    +        return key in self._state
    +
    +    def get_results(self, as_dict: bool = False):
    +        """
    +        Aggregate results from all the calculators
    +
    +        Parameters:
    +            as_dict:
    +                Flag to return the results as a dictionary
    +        """
    +        results = deepcopy(self._results)
    +        if as_dict:
    +            return {k: v.as_dict() for k, v in results.items()}
    +        return {k: v for k, v in self._results.items()}
    +
    +    def run_calculators(self):
    +        """
    +        Run the saved calculators and save the results in the manager
    +        """
    +        logger.info("Processing dataset statistics")
    +        for calculator in self._statistic_calculators:
    +            calculator.run(self.state)
    +            self._results[calculator.__class__.__name__] = calculator.result
    +
    +
    + + + +
    + + + + + + + +
    + + + +

    + state: Dict + + + property + + +

    + + +
    + +

    Return the dictionary state of the manager

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Dict + +
    +

    State of the StatisticManager

    +
    +
    +
    + +
    + + + +
    + + +

    + __init__(dataset, recompute=False, *statistic_calculators) + +

    + + +
    + + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    dataset + +
    +

    openqdc.datasets.base.BaseDataset +The dataset object to compute the statistics

    +
    +
    + required +
    recompute + bool + +
    +

    Flag to recompute the statistics

    +
    +
    + False +
    *statistic_calculators + AbstractStatsCalculator + +
    +

    List of statistic calculators to run

    +
    +
    + () +
    + +
    + Source code in openqdc/datasets/statistics.py +
    70
    +71
    +72
    +73
    +74
    +75
    +76
    +77
    +78
    +79
    +80
    +81
    +82
    +83
    +84
    +85
    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
    +    """
    +    Parameters:
    +        dataset : openqdc.datasets.base.BaseDataset
    +            The dataset object to compute the statistics
    +        recompute:
    +            Flag to recompute the statistics
    +        *statistic_calculators:
    +            List of statistic calculators to run
    +    """
    +    self._state = {}
    +    self._results = {}
    +    self._statistic_calculators = [
    +        statistic_calculators.from_openqdc_dataset(dataset, recompute)
    +        for statistic_calculators in statistic_calculators
    +    ]
    +
    +
    +
    + +
    + +
    + + +

    + get_results(as_dict=False) + +

    + + +
    + +

    Aggregate results from all the calculators

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    as_dict + bool + +
    +

    Flag to return the results as a dictionary

    +
    +
    + False +
    + +
    + Source code in openqdc/datasets/statistics.py +
    136
    +137
    +138
    +139
    +140
    +141
    +142
    +143
    +144
    +145
    +146
    +147
    def get_results(self, as_dict: bool = False):
    +    """
    +    Aggregate results from all the calculators
    +
    +    Parameters:
    +        as_dict:
    +            Flag to return the results as a dictionary
    +    """
    +    results = deepcopy(self._results)
    +    if as_dict:
    +        return {k: v.as_dict() for k, v in results.items()}
    +    return {k: v for k, v in self._results.items()}
    +
    +
    +
    + +
    + +
    + + +

    + get_state(key=None) + +

    + + +
    + +

    Return the value of the key in the state dictionary

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    key + Optional[str] + +
    +

    str, default = None

    +
    +
    + None +
    +

    Returns: + the value of the key in the state dictionary + or the whole state dictionary if key is None

    + +
    + Source code in openqdc/datasets/statistics.py +
    109
    +110
    +111
    +112
    +113
    +114
    +115
    +116
    +117
    +118
    +119
    +120
    +121
    def get_state(self, key: Optional[str] = None) -> Optional[Any]:
    +    """
    +    Return the value of the key in the state dictionary
    +
    +    Parameters:
    +        key: str, default = None
    +    Returns:
    +        the value of the key in the state dictionary
    +        or the whole state dictionary if key is None
    +    """
    +    if key is None:
    +        return self._state
    +    return self._state.get(key, None)
    +
    +
    +
    + +
    + +
    + + +

    + has_state(key) + +

    + + +
    + +

    Check if the state has the key

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    key + str + +
    +

    Key to check in the state dictionary

    +
    +
    + required +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + bool + +
    +

    True if the key is in the state dictionary

    +
    +
    + +
    + Source code in openqdc/datasets/statistics.py +
    123
    +124
    +125
    +126
    +127
    +128
    +129
    +130
    +131
    +132
    +133
    +134
    def has_state(self, key: str) -> bool:
    +    """
    +    Check if the state has the key
    +
    +    Parameters:
    +        key:
    +            Key to check in the state dictionary
    +
    +    Returns:
    +        True if the key is in the state dictionary
    +    """
    +    return key in self._state
    +
    +
    +
    + +
    + +
    + + +

    + reset_results() + +

    + + +
    + +

    Reset the results dictionary

    + +
    + Source code in openqdc/datasets/statistics.py +
    103
    +104
    +105
    +106
    +107
    def reset_results(self):
    +    """
    +    Reset the results dictionary
    +    """
    +    self._results = {}
    +
    +
    +
    + +
    + +
    + + +

    + reset_state() + +

    + + +
    + +

    Reset the state dictionary

    + +
    + Source code in openqdc/datasets/statistics.py +
     97
    + 98
    + 99
    +100
    +101
    def reset_state(self):
    +    """
    +    Reset the state dictionary
    +    """
    +    self._state = {}
    +
    +
    +
    + +
    + +
    + + +

    + run_calculators() + +

    + + +
    + +

    Run the saved calculators and save the results in the manager

    + +
    + Source code in openqdc/datasets/statistics.py +
    149
    +150
    +151
    +152
    +153
    +154
    +155
    +156
    def run_calculators(self):
    +    """
    +    Run the saved calculators and save the results in the manager
    +    """
    +    logger.info("Processing dataset statistics")
    +    for calculator in self._statistic_calculators:
    +        calculator.run(self.state)
    +        self._results[calculator.__class__.__name__] = calculator.result
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + +
    + + + +

    + StatisticsResults + + +

    + + +
    + + +

    Parent class to statistics results +to provide general methods.

    + +
    + Source code in openqdc/datasets/statistics.py +
    13
    +14
    +15
    +16
    +17
    +18
    +19
    +20
    +21
    +22
    +23
    +24
    +25
    +26
    +27
    +28
    +29
    +30
    +31
    +32
    +33
    +34
    +35
    +36
    +37
    +38
    class StatisticsResults:
    +    """
    +    Parent class to statistics results
    +    to provide general methods.
    +    """
    +
    +    def to_dict(self) -> Dict:
    +        """
    +        Convert the class to a dictionary
    +
    +        Returns:
    +            Dictionary representation of the class
    +        """
    +        return asdict(self)
    +
    +    def transform(self, func: Callable):
    +        """
    +        Apply a function to all the attributes of the class
    +
    +        Parameters:
    +            func:
    +                Function to apply to the attributes
    +        """
    +        for k, v in self.to_dict().items():
    +            if v is not None:
    +                setattr(self, k, func(v))
    +
    +
    + + + +
    + + + + + + + + + +
    + + +

    + to_dict() + +

    + + +
    + +

    Convert the class to a dictionary

    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Dict + +
    +

    Dictionary representation of the class

    +
    +
    + +
    + Source code in openqdc/datasets/statistics.py +
    19
    +20
    +21
    +22
    +23
    +24
    +25
    +26
    def to_dict(self) -> Dict:
    +    """
    +    Convert the class to a dictionary
    +
    +    Returns:
    +        Dictionary representation of the class
    +    """
    +    return asdict(self)
    +
    +
    +
    + +
    + +
    + + +

    + transform(func) + +

    + + +
    + +

    Apply a function to all the attributes of the class

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    func + Callable + +
    +

    Function to apply to the attributes

    +
    +
    + required +
    + +
    + Source code in openqdc/datasets/statistics.py +
    28
    +29
    +30
    +31
    +32
    +33
    +34
    +35
    +36
    +37
    +38
    def transform(self, func: Callable):
    +    """
    +    Apply a function to all the attributes of the class
    +
    +    Parameters:
    +        func:
    +            Function to apply to the attributes
    +    """
    +    for k, v in self.to_dict().items():
    +        if v is not None:
    +            setattr(self, k, func(v))
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + +
    + + + +

    + TotalEnergyStats + + +

    + + +
    +

    + Bases: AbstractStatsCalculator

    + + +

    Total Energy statistics calculator class

    + +
    + Source code in openqdc/datasets/statistics.py +
    348
    +349
    +350
    +351
    +352
    +353
    +354
    +355
    +356
    +357
    class TotalEnergyStats(AbstractStatsCalculator):
    +    """
    +    Total Energy statistics calculator class
    +    """
    +
    +    def compute(self) -> EnergyStatistics:
    +        converted_energy_data = self.energies
    +        total_E_mean = np.nanmean(converted_energy_data, axis=0)
    +        total_E_std = np.nanstd(converted_energy_data, axis=0)
    +        return EnergyStatistics(mean=np.atleast_2d(total_E_mean), std=np.atleast_2d(total_E_std))
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + + + + +
    + +
    + +
    + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/stable/API/units.html b/stable/API/units.html index ea3f0e7..0c7b3de 100644 --- a/stable/API/units.html +++ b/stable/API/units.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/API/utils.html b/stable/API/utils.html index 019804c..0f5a370 100644 --- a/stable/API/utils.html +++ b/stable/API/utils.html @@ -712,12 +712,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/cli.html b/stable/cli.html index 0b43ef7..a8abde4 100644 --- a/stable/cli.html +++ b/stable/cli.html @@ -815,12 +815,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/contribute.html b/stable/contribute.html index 7804bea..0f024bb 100644 --- a/stable/contribute.html +++ b/stable/contribute.html @@ -707,12 +707,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/data_storage.html b/stable/data_storage.html index f913567..9460663 100644 --- a/stable/data_storage.html +++ b/stable/data_storage.html @@ -761,12 +761,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/dataset_upload.html b/stable/dataset_upload.html index 08800d5..a397489 100644 --- a/stable/dataset_upload.html +++ b/stable/dataset_upload.html @@ -707,12 +707,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/datasets.html b/stable/datasets.html index 1c75653..e850947 100644 --- a/stable/datasets.html +++ b/stable/datasets.html @@ -717,12 +717,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/index.html b/stable/index.html index 3b52948..e73e2ed 100644 --- a/stable/index.html +++ b/stable/index.html @@ -781,12 +781,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • @@ -1852,12 +1961,12 @@

    Overviewhttps://openqdc.io .

    Installation

    Use mamba:

    -
    mamba install -c conda-forge openqdc
    +
    conda install -c conda-forge openqdc
     
    -

    Tips: You can replace mamba by conda.

    +

    Tips: You can replace conda by mamba.

    Note: We highly recommend using a Conda Python distribution to install OpenQDC. The package is also pip installable if you need it: pip install openqdc.

    Quick API Tour

    from openqdc import Spice
    @@ -1890,7 +1999,7 @@ 

    Quick API Tour)

    How to cite

    -

    Please cite OpenQDC if you use it in your research: DOI.

    +

    Please cite OpenQDC if you use it in your research: Pending Publication.

    Compatibilities

    OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.

    diff --git a/stable/licensing.html b/stable/licensing.html index 4208532..c301677 100644 --- a/stable/licensing.html +++ b/stable/licensing.html @@ -700,12 +700,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/normalization_e0s.html b/stable/normalization_e0s.html index 0e83287..40dd8bc 100644 --- a/stable/normalization_e0s.html +++ b/stable/normalization_e0s.html @@ -789,12 +789,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • diff --git a/stable/objects.inv b/stable/objects.inv index 4c938b7bfd17061b0cbab4fe4f050a5658adfccc..b0fd3ac522d75ddbe4ff74f4df5a993978b4cb31 100644 GIT binary patch delta 3559 zcmV+3em>7-X9X zA042R?)mjAi3WtEBLq5hvp17WT6*3WeIXr?$krtN5eAFUFa3g)#p2@A_1TN{?w>*I z7sZR^i;LUq-|8QK^V4YYTqg0~GdbHjD@j^Le!O65IOHFH%YSt6%gb*le8zz#yg!y$ zDhve(;amxH9ZMAaQjd>=XOiq-f$H~uk$t+Fr;)n+yt;h@3I$oRe$y*crK(68iK`Fy zFV9c^1&V~E03@QcB)K1yQI;-Ps+y!1dA-7spd7ux{hK#93Je5FY~P}ZP?SdI_p8+n zDDe~~ac_n5M1LHk!+-b;SHu&^l72i(#4#HD_5J6wbC4eunRj;HtFTX2u{0t#utI!p zQ3W*7jTXGRTEatL5-yj$)_Ee1(c-VFzk(X;oRXsSG54PmN-TQylnkwg&HyO`KYoNS zvHIPj2?i*r_OCvE1w~VqAbc!)C7Xl|QX}^{e@IDrMN&VGK*@|J@e25I(C_r<%~N53(_by{ELBBzKk~o{R1w-Uo2IyPt_~kO|7cvtbcr3e);wZ)GyMdmv7Tv~KB2eXr|cJ3awuXtL+dCYunYEUzcNo%kst{}(;Y0LZ@JY^H<&s% zfKbfmmIk3JfyKnMN!ma)f?rp6@FJ*8EM^lt5y$B8xt|shvtx*Ilx?E^v9go6gV)Ia zaeud3z5;cB{z#UudIhUg6-gs8Vw8F5y*wQ@((oL;Bh9y0r=WrDt5b_6!%{Jd2L3?* zVDuyl>4M|M8OO7K+^4AMCs5YKMVTcZQj$MK>c~V+y)Y`)ei=N2Et~A~c0rKZd-@+` zg*n)>Kv{U+W0nWRqszVLEG#^aas9H)U4OAr8kH68ag*|Da3vN=6qCXWvh*j(Y2{+l z{lbg(;}a5)di0-&mwG;}*rF-bs%6mi!kS(~1mC0D1bgCb^)I_sw7pIhyp1KgjHiog z+S}*~un6*KU1n9+#<}9)+0UN{`jysLI}0R<=w*7vngr1lUL?MisVwG&9!EuqSAW{% zv=9atoKDNfqFp@{;4q&^>CwNmy#VL(8MY|>l9p){AaWx*zWB66LsaBCC$wn=q$n`K zNtAktzx5{ScrxC`QkwB`AMYB|P4cjxfM8x*jkU4RRvzAE<-`xY-2dg#Sy8?psBp*y zy)$@ISmxmE?k_|li$&57?5A}8v425rHVnNV&2YhGhFb=kb)5N*V}h=Q%TrO;9s`T- zEAZ=XO)mM&tY8-*$+Jq3ls9D*7mMl#00L?9Y4XlVQHy}|KC&6IlbnuVR4Xp`leH|r zr?)v}Y0=*Mfcg{?=z#|L#Agk$Dk2PZl?REhNV4+Rz%#T*PJ$)#O;B!f8Giyv-IAOw zxv($SgoUhpAx}9St1~4$lGi`zT2}jGn;%lRuMMvJlr9@`mn&3Qkm%Q}^kbxARalVd zfBignY(F9bl40}DpfegYCtncqfC`MaI_PVP*1~g$5seF^G@LHHE*>G&sn;bs6wD?d;Pgj*;3~i_(|y zh_GvK4O;q2Bx_IX#ZI2ls4cX9Kmddbs~tAAg0bMz)qKk-@d zXreq9+s7EY9_oK2=_XMLCbj(xxS4M|H#r$Nk4C4sqIFA>HJj-9DEC}N;K=P-=jYfh zTk+IJ+dOLE+4X*@@Ok8o&ifE6c$%eVqf&eFXt9=AzQb_Nnz*<2;jhh{z zDYtM6LWkST*(N@~w$3);#2X3)p=q}kw#g2%$*@g1@vcBo=*as61uMtdyeAkq&NjX_ z%YirawaJgZwXe;0z)gOV==9tD1i^W2{A;sMyali;bfC?EqUdDX0y`p8ZV>DWO}WLOHt~!$915lmwe?VtoZBWu!P@lO 
z5hbzdHzvx02iT(6rk!~2pdfVIU4(761MDYk6V7O7p|ZG#16rLZ!HE@2jXbz8)5u`1 zj+XR8q&S%)s1pfuXW4G@bJ6JBv{7*VuH9mGdcKuGw13)Jx|4V~3vRISc4 znXub#Sbx+Kr-`<@U_0?n*s3OoIs^I=hQQXUiw)EuhIS2-i z2_9~TC?Oweg^C7Fi9^(Y&gMdtkdMY1fbO20)@gI{p&UK=7Z(pj$zJ8=r*gDs%*f3S z32WL|k0sHwrxHtsJv-QxWI~0ZN9!l?K5IGonFOP`Z@?69+X^RM(MbJTf3#TQa+;<}M(Lk-z%iJzo} zSKt!v$mOlLnMZNMC!skMx(qdK?;moly|=9~-oP6Yp1+R77>~X88a%IBsWF~AH-9WV ze;hl?&nBIy$QXHyBh#G`&z-C?#wm1X#d9et^g0yJ{sT{&8ZmNdx$_yk^!8>qe(7z_ zehAasKI$k2SUT=NrZ+Xyu}p7v#={xbBurt>a~PZH6il_Kiznc@tLY<*{a=l%Gng4_ z?9698o%Os>H9Dtp)at$m<~=0W!j>t1g7)9Nxh(H z&XZh0pjZD3d9o7Ja}-Y!(X?+;mu^?rr4U-Y)!E@?5i9hI8M0Wbd6%!hcMqiI5Yr|C@_W_)WU@ zgYS-1;LB=`AVd2d_RX!Hj(;)05!mP*4RyE!{OCK1qnQ>uz=@_EIojA0jUKcgWE z1k5ZNo#^NIO-^)kum)Fuu7$k=?tveB>wM|uq%S*L{t@95AJT}|3*(=T8SbI_;>>k> zEVcaA8aGb&8g-xTLLzIwjbibwox8+@gWQ1M_!)Ff5(TZS%-qA4RDYdoWDx5dQ_BjA?W8}^k5f%y6#3yv(!ud)QiGx zF@~fa`z-nxJjPjM<{wU1XMpW@)`2XG%g#(ptj(W{D|cF ztLdxvm!E&F4*p7#aB^S7(a)*swoVJ06d{Qw+(tlt`6-3q=YQunPWYUIOL%`yU8!&s zBIM>wu=AKY!B1g)sQB`t;O|0tcZI+jq`H7z&X2eZE|v5_dry zbzX2`Brp*E<9`?YLfp|b?#6Q?fdTNJufH76P=1)F-tnwcVP{sd0Fec*5X}rO0W(^F z;N^UZ-}>TUI_(Ty7zqrBpO?R)8taUa!eo_^yO+EzfY-(tqS3KI@jPO(ifOdqHj^GXL}q z6-~H7uqrww_lA2UAonG^O=vNv35h~fG8IN%Agnrdo7N2hAb`vBtEKnhWbyYU-kDjm zb$4KHB*8GL%jKynA&!D%%AxeB8cHJ$(v}nu&eihx0~*3Ze08urWtKHHWU1h1P7_ftS7!|c$li}tyN;qM%<-@)iJ2k0{x zy}8RZaCX!H&#g)lCZWH!Jynp)`;|T}=bAZDMjzwAU{X}sRXjlI`uzRHUxYoR&!bJw ze&9{`gIoDKt4`^rbz21ra` z^wvAeZt<2!OAHEV9r_fHf*MRhUNX#uYLZyMh#B^`E zhXR7%mRI;9sE7?_6T(PfAbduWJmleMvK&Qi)P1jP7&q{M{GV6L=^Iq{@gbhR=@cxj zN`Dd{G2kw9;+?$icc&2^op+j>H?PqEwr^e=bQx~NDFFPI{owRC4cLO?)v>_y|J+%W z^dro5@`3%+TJzaZCZ4xSGoN~F0d}7Tx#w{%DT>S%8zx~<(pH;9G>k1VkHd)Oo}VU< zG-J)}l1_3j+}$zU0NE;*ttnY&V+)|%&ws*ok(NE%5k8FsJPPx|9hD|DyL+nof}5Uj z$JT)C7mpNa?63hiHZMrQ&Oh`Wazoa^h<{H5r^t0i*`m#-dF~RNq+*o01J+@>V`h{D zUPgX+?3SE`)FC-1kJK9@WC7kj{pXO#6Ctdtl7!uAH%_k^NAJe1TX2!OZG+7^N`HxE z@1O75@Y~axo{Q%|&N<5Snzw^89p4mTluyb72n5soC>qyF5kz2mpZN3GN)94mYD*d~ z$r{Ncv#CK$(1cW8WE;QOWGV!c`bzIIHcr#lygaK8K&Mn6to<>Ce@K>Do{EgXWcahk 
zZ(N*xk2bnKioZ@KO4&rBd?GS|ND-mQgayUQFIVA{jkmL`u?TUKKz zI)KRoZ7?pH;cZ`yAOlEaAP&XE^*DnAY78_}TzmLhQFCcb!cjH{r7;i~?SFRQZ8?D9 zAonBK1k=11^di*>uTe|Jnld97kF~_cFC%M8j#yCE5E`?zOcFX=ky&kgfO50iaO6TK zNod@Xr`qfw#ZR^6$kj@+&@1ull4|#nOOu*H2P#sMMSn*tS89liS-8{`8oh+6AvazzlPoh@S(7X>T7i=+GJ2_# zB6f(Pr`q;_bQTt3B zirOdiI#6Yz4L?0OUuT3;^M7u5y)SPM9iIQwyja?&5DhR&Hqr|9ENg~37pQ3wCAD~D ziUjkP&fubCkNvId6@Pu{`^2QCHIzK3F%nFo+QF+_CLV1wEoz7ZvaL4QM!pjd6-c7? z0VA1OYZ0Eh#DlK0p9UB(gKH0LWM>ECl;o)9l)=1fPdxN?WfUDeo$Nx6r&#fps^?rh$P}No=gd0jW807c|Wca~N;o4p{CqqJEkM+Dl`MIc&r>2f?g;I&>KuCL{h+VtZ9 z{%;y49!Z3PYc}6@=W)YPz%1a5cq{&J9EHD1lzqrn$barVqgMD$vL^m_ODeJ@&l05R zzx{T|bqtIi5uu&lGN1;xM?COAu?(h$_J{=NN0wpih}syy?UVhxorHN2olcJD`MaDP z&A~d{?u!l+{t^7Ww(m<`5L4HtU1Zf4;D3+uVS@%8ZBzq%!x!L-=j))hQ9LpB zzd8I-2`yGQVZ`wl=$e+lLHz0~+O` zuU{MNeITCeuV#SVqvyQ#Ifjz;%uj7gv5u^{vG#)a_mt-`qlSPTN7CTZd&B=N#vn-3v@_dbI$hTr21oW*C3+B3lVt4N5;1p4IF0B7AeG5~db mWy})9rp?3R(tnA}3Ntt+?bpg|D1#&Fo(a)68UF(x?+^6u6Z)$F diff --git a/stable/search/search_index.json b/stable/search/search_index.json index bfce75f..3c3c267 100644 --- a/stable/search/search_index.json +++ b/stable/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"

    OpenQDC is a python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets and provide a way to standardize the data for easy use in machine learning models.

    • \ud83d\udc0d Simple pythonic API
    • \ud83d\udd79\ufe0f ML-Ready: all you manipulate are torch.Tensor, jax.Array or numpy.Array objects.
    • \u269b\ufe0f Quantum Ready: The quantum methods are checked and standardized to provide additional values.
    • \u2705 Standardized: The datasets are written in standard and performant formats with annotated metadata like units and labels.
    • \ud83e\udde0 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
    • \ud83d\udcc8 Data: have access to 1.5+ billion datapoints

    Visit our website at TOFILL ."},{"location":"index.html#installation","title":"Installation","text":"

    Use mamba:

    mamba install -c conda-forge openqdc\n

    Tips: You can replace mamba by conda.

    Note: We highly recommend using a Conda Python distribution to install OpenQDC. The package is also pip installable if you need it: pip install openqdc.

    "},{"location":"index.html#quick-api-tour","title":"Quick API Tour","text":"
    from openqdc import Spice\n\n# Load the original dataset\ndataset = Spice()\n\n# Load the dataset with different units\ndataset = Spice(\n    energy_unit = \"kcal/mol\",\n    distance_unit = \"ang\",\n    energy_type = \"formation\",\n    array_format = \"torch\"\n)\n\n# Access the data\ndata = dataset[0]\n\n# Get relevant statistics\ndataset.get_statistics()\n\n# Get dataset metadata\ndataset.average_n_atoms\ndataset.chemical_species\ndataset.charges\n\n# Compute physical descriptors\ndataset.calculate_descriptors(\n    descriptor_name = \"soap\"\n)\n
    "},{"location":"index.html#how-to-cite","title":"How to cite","text":"

    Please cite OpenQDC if you use it in your research: .

    "},{"location":"index.html#compatibilities","title":"Compatibilities","text":"

    OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.

    "},{"location":"cli.html","title":"CLI for dataset downloading and uploading","text":"

    You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).

    "},{"location":"cli.html#datasets","title":"Datasets","text":"

    Print a formatted table of the available openQDC datasets and some information about them.

    Usage:

    openqdc datasets [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#cache","title":"Cache","text":"

    Get the current local cache path of openQDC

    Usage:

    openqdc cache [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#download","title":"Download","text":"

    Download preprocessed ml-ready datasets from the main openQDC hub.

    Usage:

    openqdc download DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n--as-zarr       Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]\n--gs            Which source to use for downloading. If True, Google Storage will be used. Otherwise, AWS S3 will be used. [default: no-gs]\n

    Example:

    openqdc download Spice\n
    "},{"location":"cli.html#fetch","title":"Fetch","text":"

    Download the raw datasets files from the main openQDC hub

    Note:

    Special case: if the dataset is \"all\", \"potential\", \"interaction\".\n

    Usage:

    openqdc fetch DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n

    Example:

    openqdc fetch Spice\n
    "},{"location":"cli.html#preprocess","title":"Preprocess","text":"

    Preprocess a raw dataset (previously fetched) into an openQDC dataset and optionally push it to remote.

    Usage:

    openqdc preprocess DATASETS... [OPTIONS]\n

    Options:

    --help         Show this message and exit.\n--overwrite    Whether to overwrite the current cached datasets. [default: overwrite]\n--upload       Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]\n--as-zarr      Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]\n

    Example:

    openqdc preprocess Spice QMugs\n
    "},{"location":"cli.html#upload","title":"Upload","text":"

    Upload a preprocessed dataset to the remote storage

    Usage:

    openqdc upload DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the remote files if they are present. [default: overwrite]\n--as-zarr       Whether to upload the zarr files if available. [default: no-as-zarr]\n

    Example:

    openqdc upload Spice --overwrite\n
    "},{"location":"cli.html#convert","title":"Convert","text":"

    Convert a preprocessed dataset from a memmap dataset to a zarr dataset.

    Usage:

    openqdc convert DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the current zarr cached datasets. [default: no-overwrite]\n--download      Whether to force the re-download of the memmap datasets. [default: no-download]\n
    "},{"location":"contribute.html","title":"Contribute","text":"

    The below documents the development lifecycle of OpenQDC.

    "},{"location":"contribute.html#setup-a-dev-environment","title":"Setup a dev environment","text":"
    mamba env create -n openqdc -f env.yml\nmamba activate openqdc\npip install -e .\n
    "},{"location":"contribute.html#pre-commit-installation","title":"Pre commit installation","text":"
    pre-commit install\npre-commit run --all-files\n
    "},{"location":"contribute.html#continuous-integration","title":"Continuous Integration","text":"

    OpenQDC uses Github Actions to:

    • Build and test openQDC.
      • Multiple combinations of OS and Python versions are tested.
    • Check the code:
      • Formatting with black.
      • Static type check with mypy.
      • Modules import formatting with isort.
      • Pre-commit hooks.
    • Documentation:
      • Google docstring format.
      • build and deploy the documentation on main and for every new git tag.
    "},{"location":"contribute.html#run-tests","title":"Run tests","text":"
    pytest\n
    "},{"location":"contribute.html#build-the-documentation","title":"Build the documentation","text":"

    You can build and serve the documentation locally with:

    # Build and serve the doc\nmike serve\n

    or with

    mkdocs serve\n
    "},{"location":"contribute.html#multi-versionning","title":"Multi-versionning","text":"

    The documentation is built for each push on main and for every git tag using mike. Everything is automated using GitHub Actions. Please refer to mike's official documentation for the details.

    "},{"location":"data_storage.html","title":"Data structure","text":""},{"location":"data_storage.html#dataset-structure","title":"Dataset structure","text":"

    For a dataset with N geometries, M atoms across all geometries, ne energy labels, and nf force labels, we use zarr or memory-mapped arrays of various sizes:

    • (M, 5) for atomic numbers (1), charges (1), and positions (3) of individual geometries;
    • (N, 2) for the beginning and end indices of each geometry in the previous array;
    • (N, ne) for the energy labels of each geometry, extendable to store other geometry-level QM properties such as HOMO-LUMO gap;
    • (M, nf , 3) for the force labels of each geometry, extendable to store other atom-level QM properties.

    The memory-mapped files efficiently access data stored on disk or in the cloud without reading them into memory, enabling training on machines with smaller RAM than the dataset size and accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, batching and iteration.

    "},{"location":"data_storage.html#formats","title":"Formats","text":"

    We currently support the following formats:

    1) Zarr : https://zarr.readthedocs.io/en/stable/index.html

    2) Memmap : https://numpy.org/doc/stable/index.html

    "},{"location":"dataset_upload.html","title":"How to Add a Dataset to OpenQDC","text":"

    Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:

    1. Opening a PR to add a new dataset
    2. Request a new dataset through Google Form
    "},{"location":"dataset_upload.html#openqdc-pr-guidelines","title":"OpenQDC PR Guidelines","text":"

    Implement your dataset in the OpenQDC repository by following the guidelines below:

    "},{"location":"dataset_upload.html#dataset-class","title":"Dataset class","text":"
    • The dataset class should be implemented in the openqdc/datasets directory.
    • The dataset class should inherit from the openqdc.datasets.base.BaseDataset class.
    • Add your dataset.py file to the openqdc/datasets/potential or openqdc/datasets/interaction/ directory based on the type of energy.
    • Implement the following for your dataset:
      • Add the metadata of the dataset:
        • Docstrings for the dataset class. Docstrings should report links and references to the dataset. A small description and if possible, the sampling strategy used to generate the dataset.
        • __links__: Dictionary of name and link to download the dataset.
        • __name__: Name of the dataset. This will create a folder with the name of the dataset in the cache directory.
        • The original units for the dataset __energy_unit__ and __distance_unit__.
        • __force_mask__: Boolean indicating whether the dataset has forces, or a list of booleans if multiple force labels are present.
        • __energy_methods__: List of the QmMethod methods present in the dataset.
      • read_raw_entries(self) -> List[Dict[str, Any]]: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format, look at data storage. This data should have the following keys:
        • atomic_inputs : Atomic inputs of the molecule. numpy.Float32.
        • name: Atomic numbers of the atoms in the molecule. numpy.Object.
        • subset: Positions of the atoms in the molecule. numpy.Object.
        • energies: Energies of the molecule. numpy.Float64.
        • n_atoms: Number of atoms in the molecule. numpy.Int32
        • forces: Forces of the molecule. [Optional] numpy.Float32.
      • Add the dataset import to the openqdc/datasets/<type_of_dataset>/__init__.py file and to openqdc/__init__.py.
    "},{"location":"dataset_upload.html#test-the-dataset","title":"Test the dataset","text":"

    Try to run the openQDC CLI pipeline with the dataset you implemented.

    Run the following command to download the dataset:

    • Fetch the dataset files
      openqdc fetch DATASET_NAME\n
    • Preprocess the dataset
      openqdc preprocess DATASET_NAME\n
    • Load it on python and check if the dataset is correctly loaded.
      from openqdc import DATASET_NAME\nds=DATASET_NAME()\n

    If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.

    • Select the dataset label for your PR.

    Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.

    "},{"location":"dataset_upload.html#openqdc-google-form","title":"OpenQDC Google Form","text":"

    Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you. You can fill out the Google Form here

    As the openQDC team will strive to provide a high quality curation and upload, please be patient as the team will need to review the dataset and carry out the necessary steps to ensure the dataset is uploaded correctly.

    "},{"location":"datasets.html","title":"Overview of Datasets","text":"

    We provide support for the following publicly available QM Datasets.

    Dataset # Molecules # Conformers Average Conformers per Molecule Force Labels Atom Types QM Level of Theory Off-Equilibrium Conformations GEOM 450,000 37,000,000 82 No 18 GFN2-xTB No Molecule3D 3,899,647 3,899,647 1 No 5 B3LYP/6-31G* No NablaDFT 1,000,000 5,000,000 5 No 6 \u03c9B97X-D/def2-SVP QMugs 665,000 2,000,000 3 No 10 GFN2-xTB, \u03c9B97X-D/def2-SVP No Spice 19,238 1,132,808 59 Yes 15 \u03c9B97M-D3(BJ)/def2-TZVPPD Yes ANI 57,462 20,000,000 348 No 4 \u03c9B97x:6-31G(d) Yes tmQM 86,665 No TPSSh-D3BJ/def2-SVP DES370K 3,700 370,000 100 No 20 CCSD(T) Yes DES5M 3,700 5,000,000 1351 No 20 SNS-MP2 Yes OrbNet Denali 212,905 2,300,000 11 No 16 GFN1-xTB Yes SN2RXN 39 452709 11,600 Yes 6 DSD-BLYP-D3(BJ)/def2-TZVP QM7X 6,950 4,195,237 603 Yes 7 PBE0+MBD Yes"},{"location":"licensing.html","title":"License","text":"
    Creative Commons Attribution-NonCommercial 4.0 International\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright and\ncertain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n-   Considerations for licensors: Our public licenses are intended for\n    use by those authorized to give the public permission to use\n    material in ways otherwise restricted by copyright and certain other\n    rights. Our licenses are irrevocable. Licensors should read and\n    understand the terms and conditions of the license they choose\n    before applying it. Licensors should also secure all rights\n    necessary before applying our licenses so that the public can reuse\n    the material as expected. Licensors should clearly mark any material\n    not subject to the license. This includes other CC-licensed\n    material, or material used under an exception or limitation to\n    copyright. 
More considerations for licensors :\n    wiki.creativecommons.org/Considerations\\_for\\_licensors\n\n-   Considerations for the public: By using one of our public licenses,\n    a licensor grants the public permission to use the licensed material\n    under specified terms and conditions. If the licensor's permission\n    is not necessary for any reason\u2013for example, because of any\n    applicable exception or limitation to copyright\u2013then that use is not\n    regulated by the license. Our licenses grant only permissions under\n    copyright and certain other rights that a licensor has authority to\n    grant. Use of the licensed material may still be restricted for\n    other reasons, including because others have copyright or other\n    rights in the material. A licensor may make special requests, such\n    as asking that all changes be marked or described. Although not\n    required by our licenses, you are encouraged to respect those\n    requests where reasonable. More considerations for the public :\n    wiki.creativecommons.org/Considerations\\_for\\_licensees\n\nCreative Commons Attribution-NonCommercial 4.0 International Public\nLicense\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial 4.0 International Public License (\"Public\nLicense\"). To the extent this Public License may be interpreted as a\ncontract, You are granted the Licensed Rights in consideration of Your\nacceptance of these terms and conditions, and the Licensor grants You\nsuch rights in consideration of benefits the Licensor receives from\nmaking the Licensed Material available under these terms and conditions.\n\n-   Section 1 \u2013 Definitions.\n\n    -   a. 
Adapted Material means material subject to Copyright and\n        Similar Rights that is derived from or based upon the Licensed\n        Material and in which the Licensed Material is translated,\n        altered, arranged, transformed, or otherwise modified in a\n        manner requiring permission under the Copyright and Similar\n        Rights held by the Licensor. For purposes of this Public\n        License, where the Licensed Material is a musical work,\n        performance, or sound recording, Adapted Material is always\n        produced where the Licensed Material is synched in timed\n        relation with a moving image.\n    -   b. Adapter's License means the license You apply to Your\n        Copyright and Similar Rights in Your contributions to Adapted\n        Material in accordance with the terms and conditions of this\n        Public License.\n    -   c. Copyright and Similar Rights means copyright and/or similar\n        rights closely related to copyright including, without\n        limitation, performance, broadcast, sound recording, and Sui\n        Generis Database Rights, without regard to how the rights are\n        labeled or categorized. For purposes of this Public License, the\n        rights specified in Section 2(b)(1)-(2) are not Copyright and\n        Similar Rights.\n    -   d. Effective Technological Measures means those measures that,\n        in the absence of proper authority, may not be circumvented\n        under laws fulfilling obligations under Article 11 of the WIPO\n        Copyright Treaty adopted on December 20, 1996, and/or similar\n        international agreements.\n    -   e. Exceptions and Limitations means fair use, fair dealing,\n        and/or any other exception or limitation to Copyright and\n        Similar Rights that applies to Your use of the Licensed\n        Material.\n    -   f. 
Licensed Material means the artistic or literary work,\n        database, or other material to which the Licensor applied this\n        Public License.\n    -   g. Licensed Rights means the rights granted to You subject to\n        the terms and conditions of this Public License, which are\n        limited to all Copyright and Similar Rights that apply to Your\n        use of the Licensed Material and that the Licensor has authority\n        to license.\n    -   h. Licensor means the individual(s) or entity(ies) granting\n        rights under this Public License.\n    -   i. NonCommercial means not primarily intended for or directed\n        towards commercial advantage or monetary compensation. For\n        purposes of this Public License, the exchange of the Licensed\n        Material for other material subject to Copyright and Similar\n        Rights by digital file-sharing or similar means is NonCommercial\n        provided there is no payment of monetary compensation in\n        connection with the exchange.\n    -   j. Share means to provide material to the public by any means or\n        process that requires permission under the Licensed Rights, such\n        as reproduction, public display, public performance,\n        distribution, dissemination, communication, or importation, and\n        to make material available to the public including in ways that\n        members of the public may access the material from a place and\n        at a time individually chosen by them.\n    -   k. Sui Generis Database Rights means rights other than copyright\n        resulting from Directive 96/9/EC of the European Parliament and\n        of the Council of 11 March 1996 on the legal protection of\n        databases, as amended and/or succeeded, as well as other\n        essentially equivalent rights anywhere in the world.\n    -   l. You means the individual or entity exercising the Licensed\n        Rights under this Public License. 
Your has a corresponding\n        meaning.\n\n-   Section 2 \u2013 Scope.\n\n    -   a. License grant.\n        -   1. Subject to the terms and conditions of this Public\n            License, the Licensor hereby grants You a worldwide,\n            royalty-free, non-sublicensable, non-exclusive, irrevocable\n            license to exercise the Licensed Rights in the Licensed\n            Material to:\n            -   A. reproduce and Share the Licensed Material, in whole\n                or in part, for NonCommercial purposes only; and\n            -   B. produce, reproduce, and Share Adapted Material for\n                NonCommercial purposes only.\n        -   2. Exceptions and Limitations. For the avoidance of doubt,\n            where Exceptions and Limitations apply to Your use, this\n            Public License does not apply, and You do not need to comply\n            with its terms and conditions.\n        -   3. Term. The term of this Public License is specified in\n            Section 6(a).\n        -   4. Media and formats; technical modifications allowed. The\n            Licensor authorizes You to exercise the Licensed Rights in\n            all media and formats whether now known or hereafter\n            created, and to make technical modifications necessary to do\n            so. The Licensor waives and/or agrees not to assert any\n            right or authority to forbid You from making technical\n            modifications necessary to exercise the Licensed Rights,\n            including technical modifications necessary to circumvent\n            Effective Technological Measures. For purposes of this\n            Public License, simply making modifications authorized by\n            this Section 2(a)(4) never produces Adapted Material.\n        -   5. Downstream recipients.\n            -   A. Offer from the Licensor \u2013 Licensed Material. 
Every\n                recipient of the Licensed Material automatically\n                receives an offer from the Licensor to exercise the\n                Licensed Rights under the terms and conditions of this\n                Public License.\n            -   B. No downstream restrictions. You may not offer or\n                impose any additional or different terms or conditions\n                on, or apply any Effective Technological Measures to,\n                the Licensed Material if doing so restricts exercise of\n                the Licensed Rights by any recipient of the Licensed\n                Material.\n        -   6. No endorsement. Nothing in this Public License\n            constitutes or may be construed as permission to assert or\n            imply that You are, or that Your use of the Licensed\n            Material is, connected with, or sponsored, endorsed, or\n            granted official status by, the Licensor or others\n            designated to receive attribution as provided in Section\n            3(a)(1)(A)(i).\n    -   b. Other rights.\n        -   1. Moral rights, such as the right of integrity, are not\n            licensed under this Public License, nor are publicity,\n            privacy, and/or other similar personality rights; however,\n            to the extent possible, the Licensor waives and/or agrees\n            not to assert any such rights held by the Licensor to the\n            limited extent necessary to allow You to exercise the\n            Licensed Rights, but not otherwise.\n        -   2. Patent and trademark rights are not licensed under this\n            Public License.\n        -   3. To the extent possible, the Licensor waives any right to\n            collect royalties from You for the exercise of the Licensed\n            Rights, whether directly or through a collecting society\n            under any voluntary or waivable statutory or compulsory\n            licensing scheme. 
In all other cases the Licensor expressly\n            reserves any right to collect such royalties, including when\n            the Licensed Material is used other than for NonCommercial\n            purposes.\n\n-   Section 3 \u2013 License Conditions.\n\n    Your exercise of the Licensed Rights is expressly made subject to\n    the following conditions.\n\n    -   a. Attribution.\n        -   1. If You Share the Licensed Material (including in modified\n            form), You must:\n            -   A. retain the following if it is supplied by the\n                Licensor with the Licensed Material:\n                -   i. identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if designated);\n                -   ii. a copyright notice;\n                -   iii. a notice that refers to this Public License;\n                -   iv. a notice that refers to the disclaimer of\n                    warranties;\n                -   v. a URI or hyperlink to the Licensed Material to\n                    the extent reasonably practicable;\n            -   B. indicate if You modified the Licensed Material and\n                retain an indication of any previous modifications; and\n            -   C. indicate the Licensed Material is licensed under this\n                Public License, and include the text of, or the URI or\n                hyperlink to, this Public License.\n        -   2. You may satisfy the conditions in Section 3(a)(1) in any\n            reasonable manner based on the medium, means, and context in\n            which You Share the Licensed Material. For example, it may\n            be reasonable to satisfy the conditions by providing a URI\n            or hyperlink to a resource that includes the required\n            information.\n        -   3. 
If requested by the Licensor, You must remove any of the\n            information required by Section 3(a)(1)(A) to the extent\n            reasonably practicable.\n        -   4. If You Share Adapted Material You produce, the Adapter's\n            License You apply must not prevent recipients of the Adapted\n            Material from complying with this Public License.\n\n-   Section 4 \u2013 Sui Generis Database Rights.\n\n    Where the Licensed Rights include Sui Generis Database Rights that\n    apply to Your use of the Licensed Material:\n\n    -   a. for the avoidance of doubt, Section 2(a)(1) grants You the\n        right to extract, reuse, reproduce, and Share all or a\n        substantial portion of the contents of the database for\n        NonCommercial purposes only;\n    -   b. if You include all or a substantial portion of the database\n        contents in a database in which You have Sui Generis Database\n        Rights, then the database in which You have Sui Generis Database\n        Rights (but not its individual contents) is Adapted Material;\n        and\n    -   c. You must comply with the conditions in Section 3(a) if You\n        Share all or a substantial portion of the contents of the\n        database.\n\n    For the avoidance of doubt, this Section 4 supplements and does not\n    replace Your obligations under this Public License where the\n    Licensed Rights include other Copyright and Similar Rights.\n\n-   Section 5 \u2013 Disclaimer of Warranties and Limitation of Liability.\n\n    -   a. Unless otherwise separately undertaken by the Licensor, to\n        the extent possible, the Licensor offers the Licensed Material\n        as-is and as-available, and makes no representations or\n        warranties of any kind concerning the Licensed Material, whether\n        express, implied, statutory, or other. 
This includes, without\n        limitation, warranties of title, merchantability, fitness for a\n        particular purpose, non-infringement, absence of latent or other\n        defects, accuracy, or the presence or absence of errors, whether\n        or not known or discoverable. Where disclaimers of warranties\n        are not allowed in full or in part, this disclaimer may not\n        apply to You.\n    -   b. To the extent possible, in no event will the Licensor be\n        liable to You on any legal theory (including, without\n        limitation, negligence) or otherwise for any direct, special,\n        indirect, incidental, consequential, punitive, exemplary, or\n        other losses, costs, expenses, or damages arising out of this\n        Public License or use of the Licensed Material, even if the\n        Licensor has been advised of the possibility of such losses,\n        costs, expenses, or damages. Where a limitation of liability is\n        not allowed in full or in part, this limitation may not apply to\n        You.\n    -   c. The disclaimer of warranties and limitation of liability\n        provided above shall be interpreted in a manner that, to the\n        extent possible, most closely approximates an absolute\n        disclaimer and waiver of all liability.\n\n-   Section 6 \u2013 Term and Termination.\n\n    -   a. This Public License applies for the term of the Copyright and\n        Similar Rights licensed here. However, if You fail to comply\n        with this Public License, then Your rights under this Public\n        License terminate automatically.\n    -   b. Where Your right to use the Licensed Material has terminated\n        under Section 6(a), it reinstates:\n\n        -   1. automatically as of the date the violation is cured,\n            provided it is cured within 30 days of Your discovery of the\n            violation; or\n        -   2. 
upon express reinstatement by the Licensor.\n\n        For the avoidance of doubt, this Section 6(b) does not affect\n        any right the Licensor may have to seek remedies for Your\n        violations of this Public License.\n\n    -   c. For the avoidance of doubt, the Licensor may also offer the\n        Licensed Material under separate terms or conditions or stop\n        distributing the Licensed Material at any time; however, doing\n        so will not terminate this Public License.\n    -   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n        License.\n\n-   Section 7 \u2013 Other Terms and Conditions.\n\n    -   a. The Licensor shall not be bound by any additional or\n        different terms or conditions communicated by You unless\n        expressly agreed.\n    -   b. Any arrangements, understandings, or agreements regarding the\n        Licensed Material not stated herein are separate from and\n        independent of the terms and conditions of this Public License.\n\n-   Section 8 \u2013 Interpretation.\n\n    -   a. For the avoidance of doubt, this Public License does not, and\n        shall not be interpreted to, reduce, limit, restrict, or impose\n        conditions on any use of the Licensed Material that could\n        lawfully be made without permission under this Public License.\n    -   b. To the extent possible, if any provision of this Public\n        License is deemed unenforceable, it shall be automatically\n        reformed to the minimum extent necessary to make it enforceable.\n        If the provision cannot be reformed, it shall be severed from\n        this Public License without affecting the enforceability of the\n        remaining terms and conditions.\n    -   c. No term or condition of this Public License will be waived\n        and no failure to comply consented to unless expressly agreed to\n        by the Licensor.\n    -   d. 
Nothing in this Public License constitutes or may be\n        interpreted as a limitation upon, or waiver of, any privileges\n        and immunities that apply to the Licensor or You, including from\n        the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. Notwithstanding,\nCreative Commons may elect to apply one of its public licenses to\nmaterial it publishes and in those instances will be considered the\n\"Licensor.\" The text of the Creative Commons public licenses is\ndedicated to the public domain under the CC0 Public Domain Dedication.\nExcept for the limited purpose of indicating that material is shared\nunder a Creative Commons public license or as otherwise permitted by the\nCreative Commons policies published at creativecommons.org/policies,\nCreative Commons does not authorize the use of the trademark \"Creative\nCommons\" or any other trademark or logo of Creative Commons without its\nprior written consent including, without limitation, in connection with\nany unauthorized modifications to any of its public licenses or any\nother arrangements, understandings, or agreements concerning use of\nlicensed material. For the avoidance of doubt, this paragraph does not\nform part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n
    "},{"location":"normalization_e0s.html","title":"Overview of QM Methods and Normalization","text":"

    OpenQDC provides support for 250+ QM Methods and provides a way to standardize and categorize the different levels of theory used for Quantum Mechanics Single Point Calculations, adding value and information to the datasets.

    "},{"location":"normalization_e0s.html#level-of-theory","title":"Level of Theory","text":"

    To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums consisting of a functional, a basis set, and a correction method. OpenQDC covers more than 106 functionals, 20 basis sets, and 11 correction methods. OpenQDC provides the computed isolated atom energies e0 for each QM method.

    "},{"location":"normalization_e0s.html#normalization","title":"Normalization","text":"

    We support energy normalization through \"physical\" and \"regression\" schemes that conserve the size extensivity of chemical systems. Through physical normalization, OpenQDC provides a way to transform the potential energy into the atomization energy by subtracting the isolated atom energies e0, a physically interpretable and extensivity-conserving normalization method. Alternatively, we precompute the average contribution of each atom species to the potential energy via linear or ridge regression, centering the distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.

    "},{"location":"normalization_e0s.html#physical-normalization","title":"Physical Normalization","text":"

    e0 energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from the potential energy to obtain the atomization energy. This normalization method is physically interpretable and only removes the atom energy contribution from the potential energy.

    "},{"location":"normalization_e0s.html#regression-normalization","title":"Regression Normalization","text":"

    e0 energies are calculated for each atom in the dataset by fitting a regression model to the potential energy. The e0 energies are then subtracted from the potential energy to obtain the atomization energy. This normalization provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy. The resulting formation energy is centered at 0.

    "},{"location":"usage.html","title":"Usage","text":""},{"location":"usage.html#how-to-use","title":"How to use","text":"

    OpenQDC has been designed to be used with a single import:

    import openqdc as qdc\ndataset = qdc.QM9()\n

    All openQDC functions are available under qdc. Or if you want to directly import a specific dataset:

    from openqdc import Spice\n# Spice dataset with distance unit in angstrom instead of bohr\ndataset = Spice(distance_unit=\"ang\",\n                array_format = \"jax\"\n)\ndataset[0] # dict of jax array\n

    Or if you prefer handling ase.Atoms objects:

    dataset.get_ase_atoms(0)\n
    "},{"location":"usage.html#iterators","title":"Iterators","text":"

    OpenQDC provides a simple way to get the data as iterators:

    for data in dataset.as_iter(atoms=True):\n    print(data) # Atoms object\n    break\n
    "},{"location":"usage.html#lazy-loading","title":"Lazy loading","text":"

    OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during import openqdc as qdc. In case of trouble you can always disable lazy loading by setting the environment variable OPENQDC_DISABLE_LAZY_LOADING to 1.

    "},{"location":"API/basedataset.html","title":"Main class","text":"

    The BaseDataset class defines the functionality shared between all datasets.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset","title":"BaseDataset","text":"

    Bases: DatasetPropertyMixIn

    Base class for datasets in the openQDC package.

    Source code in openqdc/datasets/base.py
    class BaseDataset(DatasetPropertyMixIn):\n    \"\"\"\n    Base class for datasets in the openQDC package.\n    \"\"\"\n\n    energy_target_names = []\n    force_target_names = []\n    read_as_zarr = False\n    __energy_methods__ = []\n    __force_mask__ = []\n    __isolated_atom_energies__ = []\n    _fn_energy = lambda x: x\n    _fn_distance = lambda x: x\n    _fn_forces = lambda x: x\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __average_nb_atoms__ = None\n    __links__ = {}\n\n    def __init__(\n        self,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n        array_format: str = \"numpy\",\n        energy_type: Optional[str] = \"formation\",\n        overwrite_local_cache: bool = False,\n        cache_dir: Optional[str] = None,\n        recompute_statistics: bool = False,\n        transform: Optional[Callable] = None,\n        skip_statistics: bool = False,\n        read_as_zarr: bool = False,\n        regressor_kwargs: Dict = {\n            \"solver_type\": \"linear\",\n            \"sub_sample\": None,\n            \"stride\": 1,\n        },\n    ) -> None:\n        \"\"\"\n\n        Parameters:\n            energy_unit:\n                Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n            distance_unit:\n                Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n            array_format:\n                Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n            energy_type:\n                Type of isolated atom energy to use for the dataset. 
Default: \"formation\"\n                Supported types: [\"formation\", \"regression\", \"null\", None]\n            overwrite_local_cache:\n                Whether to overwrite the locally cached dataset.\n            cache_dir:\n                Cache directory location. Defaults to \"~/.cache/openqdc\"\n            recompute_statistics:\n                Whether to recompute the statistics of the dataset.\n            transform:\n                transformation to apply to the __getitem__ calls\n            regressor_kwargs:\n                Dictionary of keyword arguments to pass to the regressor.\n                Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n                solver_type can be one of [\"linear\", \"ridge\"]\n        \"\"\"\n        set_cache_dir(cache_dir)\n        # self._init_lambda_fn()\n        self.data = None\n        self._original_unit = self.energy_unit\n        self.recompute_statistics = recompute_statistics\n        self.regressor_kwargs = regressor_kwargs\n        self.transform = transform\n        self.read_as_zarr = read_as_zarr\n        self.energy_type = energy_type if energy_type is not None else \"null\"\n        self.refit_e0s = recompute_statistics or overwrite_local_cache\n        self.skip_statistics = skip_statistics\n        if not self.is_preprocessed():\n            raise DatasetNotAvailableError(self.__name__)\n        else:\n            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n        self.set_array_format(array_format)\n        self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n\n    def _init_lambda_fn(self):\n        self._fn_energy = lambda x: x\n        self._fn_distance = lambda x: x\n        self._fn_forces = lambda x: x\n\n    @property\n    def dataset_wrapper(self):\n        if not hasattr(self, \"_dataset_wrapper\"):\n            self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()\n        return 
self._dataset_wrapper\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=self.__name__, links=self.__links__)\n\n    @classmethod\n    def fetch(cls, cache_path: Optional[str] = None, overwrite: bool = False) -> None:\n        from openqdc.utils.download_api import DataDownloader\n\n        DataDownloader(cache_path, overwrite).from_config(cls.no_init().config)\n\n    def _post_init(\n        self,\n        overwrite_local_cache: bool = False,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n    ) -> None:\n        self._set_units(None, None)\n        self._set_isolated_atom_energies()\n        if not self.skip_statistics:\n            self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)\n        self._set_units(energy_unit, distance_unit)\n        self._convert_data()\n        self._set_isolated_atom_energies()\n\n    def _precompute_statistics(self, overwrite_local_cache: bool = False):\n        # if self.recompute_statistics or overwrite_local_cache:\n        self.statistics = StatisticManager(\n            self,\n            self.recompute_statistics or overwrite_local_cache,  # check if we need to recompute\n            # Add the common statistics (Forces, TotalE, FormE, PerAtomE)\n            ForcesCalculatorStats,\n            TotalEnergyStats,\n            FormationEnergyStats,\n            PerAtomFormationEnergyStats,\n        )\n        self.statistics.run_calculators()  # run the calculators\n        self._compute_average_nb_atoms()\n\n    @classmethod\n    def no_init(cls):\n        \"\"\"\n        Class method to avoid the __init__ method to be called when the class is instanciated.\n        Useful for debugging purposes or preprocessing data.\n        \"\"\"\n        return cls.__new__(cls)\n\n    @property\n    def __force_methods__(self):\n        \"\"\"\n        For backward compatibility. 
To be removed in the future.\n        \"\"\"\n        return self.force_methods\n\n    @property\n    def energy_methods(self) -> List[str]:\n        \"\"\"Return the string version of the energy methods\"\"\"\n        return [str(i) for i in self.__energy_methods__]\n\n    @property\n    def force_mask(self):\n        if len(self.__class__.__force_mask__) == 0:\n            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)\n        return self.__class__.__force_mask__\n\n    @property\n    def force_methods(self):\n        return list(compress(self.energy_methods, self.force_mask))\n\n    @property\n    def e0s_dispatcher(self):\n        if not hasattr(self, \"_e0s_dispatcher\"):\n            # Automatically fetch/compute formation or regression energies\n            self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)\n        return self._e0s_dispatcher\n\n    def _convert_data(self):\n        logger.info(\n            f\"Converting {self.__name__} data to the following units:\\n\\\n                     Energy: {str(self.energy_unit)},\\n\\\n                     Distance: {str(self.distance_unit)},\\n\\\n                     Forces: {str(self.force_unit) if self.__force_methods__ else 'None'}\"\n        )\n        for key in self.data_keys:\n            self.data[key] = self._convert_on_loading(self.data[key], key)\n\n    @property\n    def energy_unit(self):\n        return EnergyTypeConversion(self.__energy_unit__)\n\n    @property\n    def distance_unit(self):\n        return DistanceTypeConversion(self.__distance_unit__)\n\n    @property\n    def force_unit(self):\n        units = self.__forces_unit__.split(\"/\")\n        if len(units) > 2:\n            units = [\"/\".join(units[:2]), units[-1]]\n        return ForceTypeConversion(tuple(units))  # < 3.12 compatibility\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), self.__name__)\n\n    @property\n    def preprocess_path(self):\n        path 
= p_join(self.root, \"preprocessed\")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def data_keys(self):\n        keys = list(self.data_types.keys())\n        if len(self.__force_methods__) == 0:\n            keys.remove(\"forces\")\n        return keys\n\n    @property\n    def pkl_data_keys(self):\n        return list(self.pkl_data_types.keys())\n\n    @property\n    def pkl_data_types(self):\n        return {\"name\": str, \"subset\": str, \"n_atoms\": np.int32}\n\n    @property\n    def atom_energies(self):\n        return self._e0s_dispatcher\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float64,\n            \"forces\": np.float32,\n        }\n\n    @property\n    def data_shapes(self):\n        return {\n            \"atomic_inputs\": (-1, NB_ATOMIC_FEATURES),\n            \"position_idx_range\": (-1, 2),\n            \"energies\": (-1, len(self.energy_methods)),\n            \"forces\": (-1, 3, len(self.force_methods)),\n        }\n\n    def _set_units(self, en: Optional[str] = None, ds: Optional[str] = None):\n        old_en, old_ds = self.energy_unit, self.distance_unit\n        en = en if en is not None else old_en\n        ds = ds if ds is not None else old_ds\n        self.set_energy_unit(en)\n        self.set_distance_unit(ds)\n        if self.__force_methods__:\n            self._fn_forces = self.force_unit.to(str(self.energy_unit), str(self.distance_unit))\n            self.__forces_unit__ = str(self.energy_unit) + \"/\" + str(self.distance_unit)\n\n    def _set_isolated_atom_energies(self):\n        if self.__energy_methods__ is None:\n            logger.error(\"No energy methods defined for this dataset.\")\n        if self.energy_type == \"formation\":\n            f = get_conversion(\"hartree\", self.__energy_unit__)\n        else:\n            # regression are calculated 
on the original unit of the dataset\n            f = self._original_unit.to(self.energy_unit)\n        self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix)\n\n    def convert_energy(self, x):\n        return self._fn_energy(x)\n\n    def convert_distance(self, x):\n        return self._fn_distance(x)\n\n    def convert_forces(self, x):\n        return self._fn_forces(x)\n\n    def set_energy_unit(self, value: str):\n        \"\"\"\n        Set a new energy unit for the dataset.\n\n        Parameters:\n            value:\n                New energy unit to set.\n        \"\"\"\n        # old_unit = self.energy_unit\n        # self.__energy_unit__ = value\n        self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n        self.__energy_unit__ = value\n\n    def set_distance_unit(self, value: str):\n        \"\"\"\n        Set a new distance unit for the dataset.\n\n        Parameters:\n            value:\n                New distance unit to set.\n        \"\"\"\n        # old_unit = self.distance_unit\n        # self.__distance_unit__ = value\n        self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n        self.__distance_unit__ = value\n\n    def set_array_format(self, format: str):\n        assert format in [\"numpy\", \"torch\", \"jax\"], f\"Format {format} not supported.\"\n        self.array_format = format\n\n    def read_raw_entries(self):\n        \"\"\"\n        Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n        \"\"\"\n        raise NotImplementedError\n\n    def collate_list(self, list_entries: List[Dict]) -> Dict:\n        \"\"\"\n        Collate a list of entries into a single dictionary.\n\n        Parameters:\n            list_entries:\n                List of dictionaries containing the entries to collate.\n\n        Returns:\n            Dictionary containing the collated entries.\n        \"\"\"\n        # concatenate entries\n      
  res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n        csum = np.cumsum(res.get(\"n_atoms\"))\n        x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n        x[1:, 0], x[:, 1] = csum[:-1], csum\n        res[\"position_idx_range\"] = x\n\n        return res\n\n    def save_preprocess(\n        self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n    ):\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n        Parameters:\n            data_dict:\n                Dictionary containing the preprocessed data.\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                Whether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n        \"\"\"\n        # save memmaps\n        logger.info(\"Preprocessing data and saving it to cache.\")\n        paths = self.dataset_wrapper.save_preprocess(\n            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n        )\n        if upload:\n            for local_path in paths:\n                push_remote(local_path, overwrite=overwrite)  # make it async?\n\n    def read_preprocess(self, overwrite_local_cache=False):\n        logger.info(\"Reading preprocessed data.\")\n        logger.info(\n            f\"Dataset {self.__name__} with the following units:\\n\\\n                     Energy: {self.energy_unit},\\n\\\n                     Distance: {self.distance_unit},\\n\\\n                     Forces: {self.force_unit if self.force_methods else 'None'}\"\n        )\n\n        self.data = self.dataset_wrapper.load_data(\n            self.preprocess_path,\n            self.data_keys,\n            
self.data_types,\n            self.data_shapes,\n            self.pkl_data_keys,\n            overwrite_local_cache,\n        )  # this should be async if possible\n        for key in self.data:\n            logger.info(f\"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}\")\n\n    def _convert_on_loading(self, x, key):\n        if key == \"energies\":\n            return self.convert_energy(x)\n        elif key == \"forces\":\n            return self.convert_forces(x)\n        elif key == \"atomic_inputs\":\n            x = np.array(x, dtype=np.float32)\n            x[:, -3:] = self.convert_distance(x[:, -3:])\n            return x\n        else:\n            return x\n\n    def is_preprocessed(self) -> bool:\n        \"\"\"\n        Check if the dataset is preprocessed and available online or locally.\n\n        Returns:\n            True if the dataset is available remotely or locally, False otherwise.\n        \"\"\"\n        predicats = [\n            copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def is_cached(self) -> bool:\n        \"\"\"\n        Check if the dataset is cached locally.\n\n        Returns:\n            True if the dataset is cached locally, False otherwise.\n        \"\"\"\n        predicats = [\n            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n        \"\"\"\n        Preprocess the dataset and save it.\n\n        
Parameters:\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                hether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n            as_zarr:\n                Whether to save the data as zarr files\n        \"\"\"\n        if overwrite or not self.is_preprocessed():\n            entries = self.read_raw_entries()\n            res = self.collate_list(entries)\n            self.save_preprocess(res, upload, overwrite, as_zarr)\n\n    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n        \"\"\"\n        Upload the preprocessed data to the remote storage. Must be called after preprocess and\n        need to have write privileges.\n\n        Parameters:\n            overwrite:\n                Whether to overwrite the remote data if it already exists\n            as_zarr:\n                Whether to upload the data as zarr files\n        \"\"\"\n        for key in self.data_keys:\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n            push_remote(local_path, overwrite=overwrite)\n        local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n        push_remote(local_path, overwrite=overwrite)\n\n    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n        \"\"\"\n        Save a single entry at index idx as an extxyz file.\n\n        Parameters:\n            idx:\n                Index of the entry\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file. 
If None, the current working directory is used.\n            ext:\n                Whether to include additional informations like forces and other metadatas (extxyz format)\n        \"\"\"\n        if path is None:\n            path = os.getcwd()\n        at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n        write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n\n    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n        \"\"\"\n        Save dataset as single xyz file (extended xyz format).\n\n        Parameters:\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file\n        \"\"\"\n        with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n            for atoms in tqdm(\n                self.as_iter(atoms=True, energy_method=energy_method),\n                total=len(self),\n                desc=f\"Saving {self.__name__} as xyz file\",\n            ):\n                write_extxyz(f, atoms, append=True)\n\n    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n        \"\"\"\n        Get the ASE atoms object for the entry at index idx.\n\n        Parameters:\n            idx:\n                Index of the entry.\n            energy_method:\n                Index of the energy method to use\n            ext:\n                Whether to include additional informations\n\n        Returns:\n            ASE atoms object\n        \"\"\"\n        entry = self[idx]\n        at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n        return at\n\n    def subsample(\n        self, n_samples: Optional[Union[List[int], int, float]] = None, replace: bool = False, seed: int = 42\n    ):\n        np.random.seed(seed)\n        if n_samples is None:\n            return list(range(len(self)))\n        try:\n            if 0 < n_samples < 1:\n           
     n_samples = int(n_samples * len(self))\n            if isinstance(n_samples, int):\n                idxs = np.random.choice(len(self), size=n_samples, replace=replace)\n        except (ValueError, TypeError):  # list, set, np.ndarray\n            idxs = n_samples\n        return idxs\n\n    @requires_package(\"datamol\")\n    def calculate_descriptors(\n        self,\n        descriptor_name: str = \"soap\",\n        chemical_species: Optional[List[str]] = None,\n        n_samples: Optional[Union[List[int], int, float]] = None,\n        progress: bool = True,\n        **descriptor_kwargs,\n    ) -> Dict[str, np.ndarray]:\n        \"\"\"\n        Compute the descriptors for the dataset.\n\n        Parameters:\n            descriptor_name:\n                Name of the descriptor to use. Supported descriptors are [\"soap\"]\n            chemical_species:\n                List of chemical species to use for the descriptor computation, by default None.\n                If None, the chemical species of the dataset are used.\n            n_samples:\n                Number of samples to use for the computation, by default None.\n                If None, all the dataset is used.\n                If a list of integers is provided, the descriptors are computed for\n                each of the specified idx of samples.\n            progress:\n                Whether to show a progress bar, by default True.\n            **descriptor_kwargs : dict\n                Keyword arguments to pass to the descriptor instantiation of the model.\n\n        Returns:\n            Dictionary containing the following keys:\n                - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n                - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n        \"\"\"\n        import datamol as dm\n\n        datum = {}\n        idxs = self.subsample(n_samples)\n        model = get_descriptor(descriptor_name.lower())(\n          
  species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n        )\n\n        def wrapper(idx):\n            entry = self.get_ase_atoms(idx, ext=False)\n            return model.calculate(entry)\n\n        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n        datum[\"values\"] = np.vstack(descr)\n        datum[\"idxs\"] = idxs\n        return datum\n\n    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n        \"\"\"\n        Return the dataset as an iterator.\n\n        Parameters:\n            atoms:\n                Whether to return the items as ASE atoms object, by default False\n            energy_method:\n                Index of the energy method to use\n\n        Returns:\n            Iterator of the dataset\n        \"\"\"\n\n        func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n        for i in range(len(self)):\n            yield func(i)\n\n    def __iter__(self):\n        for idxs in range(len(self)):\n            yield self[idxs]\n\n    def get_statistics(self, return_none: bool = True) -> Dict:\n        \"\"\"\n        Get the converted statistics of the dataset.\n\n        Parameters:\n            return_none :\n                Whether to return None if the statistics for the forces are not available, by default True\n                Otherwise, the statistics for the forces are set to 0.0\n\n        Returns:\n            Dictionary containing the statistics of the dataset\n        \"\"\"\n        selected_stats = self.statistics.get_results()\n        if len(selected_stats) == 0:\n            raise StatisticsNotAvailableError(self.__name__)\n        if not return_none:\n            selected_stats.update(\n                {\n                    \"ForcesCalculatorStats\": {\n                        \"mean\": np.array([0.0]),\n                        \"std\": np.array([0.0]),\n    
                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                    }\n                }\n            )\n        # cycle trough dict to convert units\n        for key, result in selected_stats.items():\n            if isinstance(result, ForcesCalculatorStats):\n                result.transform(self.convert_forces)\n            else:\n                result.transform(self.convert_energy)\n            result.transform(self._convert_array)\n        return {k: result.to_dict() for k, result in selected_stats.items()}\n\n    def __str__(self):\n        return f\"{self.__name__}\"\n\n    def __repr__(self):\n        return f\"{self.__name__}\"\n\n    def __len__(self):\n        return self.data[\"energies\"].shape[0]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n\n    def _convert_array(self, x: np.ndarray):\n        return _CONVERT_DICT.get(self.array_format)(x)\n\n    def __getitem__(self, idx: int):\n        shift = MAX_CHARGE\n        p_start, p_end = self.data[\"position_idx_range\"][idx]\n        input = self.data[\"atomic_inputs\"][p_start:p_end]\n        z, c, positions, energies = (\n            self._convert_array(np.array(input[:, 0], dtype=np.int32)),\n            self._convert_array(np.array(input[:, 1], dtype=np.int32)),\n            self._convert_array(np.array(input[:, -3:], dtype=np.float32)),\n            self._convert_array(np.array(self.data[\"energies\"][idx], dtype=np.float64)),\n        )\n        name = self.__smiles_converter__(self.data[\"name\"][idx])\n        subset = self.data[\"subset\"][idx]\n        e0s = self._convert_array(self.__isolated_atom_energies__[..., z, c + shift].T)\n        
formation_energies = energies - e0s.sum(axis=0)\n        forces = None\n        if \"forces\" in self.data:\n            forces = self._convert_array(np.array(self.data[\"forces\"][p_start:p_end], dtype=np.float32))\n\n        bunch = Bunch(\n            positions=positions,\n            atomic_numbers=z,\n            charges=c,\n            e0=e0s,\n            energies=energies,\n            formation_energies=formation_energies,\n            per_atom_formation_energies=formation_energies / len(z),\n            name=name,\n            subset=subset,\n            forces=forces,\n        )\n\n        if self.transform is not None:\n            bunch = self.transform(bunch)\n\n        return bunch\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__force_methods__","title":"__force_methods__ property","text":"

    For backward compatibility. To be removed in the future.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.energy_methods","title":"energy_methods: List[str] property","text":"

    Return the string version of the energy methods

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__init__","title":"__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 'linear', 'sub_sample': None, 'stride': 1})","text":"

    Parameters:

    Name Type Description Default energy_unit Optional[str]

    Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    None distance_unit Optional[str]

    Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]

    None array_format str

    Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]

    'numpy' energy_type Optional[str]

    Type of isolated atom energy to use for the dataset. Default: \"formation\" Supported types: [\"formation\", \"regression\", \"null\", None]

    'formation' overwrite_local_cache bool

    Whether to overwrite the locally cached dataset.

    False cache_dir Optional[str]

    Cache directory location. Defaults to \"~/.cache/openqdc\"

    None recompute_statistics bool

    Whether to recompute the statistics of the dataset.

    False transform Optional[Callable]

    transformation to apply to the getitem calls

    None regressor_kwargs Dict

    Dictionary of keyword arguments to pass to the regressor. Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1} solver_type can be one of [\"linear\", \"ridge\"]

    {'solver_type': 'linear', 'sub_sample': None, 'stride': 1} Source code in openqdc/datasets/base.py
    def __init__(\n    self,\n    energy_unit: Optional[str] = None,\n    distance_unit: Optional[str] = None,\n    array_format: str = \"numpy\",\n    energy_type: Optional[str] = \"formation\",\n    overwrite_local_cache: bool = False,\n    cache_dir: Optional[str] = None,\n    recompute_statistics: bool = False,\n    transform: Optional[Callable] = None,\n    skip_statistics: bool = False,\n    read_as_zarr: bool = False,\n    regressor_kwargs: Dict = {\n        \"solver_type\": \"linear\",\n        \"sub_sample\": None,\n        \"stride\": 1,\n    },\n) -> None:\n    \"\"\"\n\n    Parameters:\n        energy_unit:\n            Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n        distance_unit:\n            Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n        array_format:\n            Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n        energy_type:\n            Type of isolated atom energy to use for the dataset. Default: \"formation\"\n            Supported types: [\"formation\", \"regression\", \"null\", None]\n        overwrite_local_cache:\n            Whether to overwrite the locally cached dataset.\n        cache_dir:\n            Cache directory location. 
Defaults to \"~/.cache/openqdc\"\n        recompute_statistics:\n            Whether to recompute the statistics of the dataset.\n        transform:\n            transformation to apply to the __getitem__ calls\n        regressor_kwargs:\n            Dictionary of keyword arguments to pass to the regressor.\n            Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n            solver_type can be one of [\"linear\", \"ridge\"]\n    \"\"\"\n    set_cache_dir(cache_dir)\n    # self._init_lambda_fn()\n    self.data = None\n    self._original_unit = self.energy_unit\n    self.recompute_statistics = recompute_statistics\n    self.regressor_kwargs = regressor_kwargs\n    self.transform = transform\n    self.read_as_zarr = read_as_zarr\n    self.energy_type = energy_type if energy_type is not None else \"null\"\n    self.refit_e0s = recompute_statistics or overwrite_local_cache\n    self.skip_statistics = skip_statistics\n    if not self.is_preprocessed():\n        raise DatasetNotAvailableError(self.__name__)\n    else:\n        self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n    self.set_array_format(array_format)\n    self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/base.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.as_iter","title":"as_iter(atoms=False, energy_method=0)","text":"

    Return the dataset as an iterator.

    Parameters:

    Name Type Description Default atoms bool

    Whether to return the items as ASE atoms object, by default False

    False energy_method int

    Index of the energy method to use

    0

    Returns:

    Type Description Iterable

    Iterator of the dataset

    Source code in openqdc/datasets/base.py
    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n    \"\"\"\n    Return the dataset as an iterator.\n\n    Parameters:\n        atoms:\n            Whether to return the items as ASE atoms object, by default False\n        energy_method:\n            Index of the energy method to use\n\n    Returns:\n        Iterator of the dataset\n    \"\"\"\n\n    func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n    for i in range(len(self)):\n        yield func(i)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.calculate_descriptors","title":"calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)","text":"

    Compute the descriptors for the dataset.

    Parameters:

    Name Type Description Default descriptor_name str

    Name of the descriptor to use. Supported descriptors are [\"soap\"]

    'soap' chemical_species Optional[List[str]]

    List of chemical species to use for the descriptor computation, by default None. If None, the chemical species of the dataset are used.

    None n_samples Optional[Union[List[int], int, float]]

    Number of samples to use for the computation, by default None. If None, all the dataset is used. If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.

    None progress bool

    Whether to show a progress bar, by default True.

    True **descriptor_kwargs

    dict Keyword arguments to pass to the descriptor instantiation of the model.

    {}

    Returns:

    Type Description Dict[str, ndarray]

    Dictionary containing the following keys: - values : np.ndarray of shape (N, M) containing the descriptors for the dataset - idxs : np.ndarray of shape (N,) containing the indices of the samples used

    Source code in openqdc/datasets/base.py
    @requires_package(\"datamol\")\ndef calculate_descriptors(\n    self,\n    descriptor_name: str = \"soap\",\n    chemical_species: Optional[List[str]] = None,\n    n_samples: Optional[Union[List[int], int, float]] = None,\n    progress: bool = True,\n    **descriptor_kwargs,\n) -> Dict[str, np.ndarray]:\n    \"\"\"\n    Compute the descriptors for the dataset.\n\n    Parameters:\n        descriptor_name:\n            Name of the descriptor to use. Supported descriptors are [\"soap\"]\n        chemical_species:\n            List of chemical species to use for the descriptor computation, by default None.\n            If None, the chemical species of the dataset are used.\n        n_samples:\n            Number of samples to use for the computation, by default None.\n            If None, all the dataset is used.\n            If a list of integers is provided, the descriptors are computed for\n            each of the specified idx of samples.\n        progress:\n            Whether to show a progress bar, by default True.\n        **descriptor_kwargs : dict\n            Keyword arguments to pass to the descriptor instantiation of the model.\n\n    Returns:\n        Dictionary containing the following keys:\n            - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n            - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n    \"\"\"\n    import datamol as dm\n\n    datum = {}\n    idxs = self.subsample(n_samples)\n    model = get_descriptor(descriptor_name.lower())(\n        species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n    )\n\n    def wrapper(idx):\n        entry = self.get_ase_atoms(idx, ext=False)\n        return model.calculate(entry)\n\n    descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n    datum[\"values\"] = np.vstack(descr)\n    datum[\"idxs\"] = idxs\n    return datum\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.collate_list","title":"collate_list(list_entries)","text":"

    Collate a list of entries into a single dictionary.

    Parameters:

    Name Type Description Default list_entries List[Dict]

    List of dictionaries containing the entries to collate.

    required

    Returns:

    Type Description Dict

    Dictionary containing the collated entries.

    Source code in openqdc/datasets/base.py
    def collate_list(self, list_entries: List[Dict]) -> Dict:\n    \"\"\"\n    Collate a list of entries into a single dictionary.\n\n    Parameters:\n        list_entries:\n            List of dictionaries containing the entries to collate.\n\n    Returns:\n        Dictionary containing the collated entries.\n    \"\"\"\n    # concatenate entries\n    res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n    csum = np.cumsum(res.get(\"n_atoms\"))\n    x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n    x[1:, 0], x[:, 1] = csum[:-1], csum\n    res[\"position_idx_range\"] = x\n\n    return res\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_ase_atoms","title":"get_ase_atoms(idx, energy_method=0, ext=True)","text":"

    Get the ASE atoms object for the entry at index idx.

    Parameters:

    Name Type Description Default idx int

    Index of the entry.

    required energy_method int

    Index of the energy method to use

    0 ext bool

    Whether to include additional information

    True

    Returns:

    Type Description Atoms

    ASE atoms object

    Source code in openqdc/datasets/base.py
    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n    \"\"\"\n    Get the ASE atoms object for the entry at index idx.\n\n    Parameters:\n        idx:\n            Index of the entry.\n        energy_method:\n            Index of the energy method to use\n        ext:\n            Whether to include additional informations\n\n    Returns:\n        ASE atoms object\n    \"\"\"\n    entry = self[idx]\n    at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n    return at\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_statistics","title":"get_statistics(return_none=True)","text":"

    Get the converted statistics of the dataset.

    Parameters:

    Name Type Description Default return_none

    Whether to return None if the statistics for the forces are not available, by default True. Otherwise, the statistics for the forces are set to 0.0

    True

    Returns:

    Type Description Dict

    Dictionary containing the statistics of the dataset

    Source code in openqdc/datasets/base.py
    def get_statistics(self, return_none: bool = True) -> Dict:\n    \"\"\"\n    Get the converted statistics of the dataset.\n\n    Parameters:\n        return_none :\n            Whether to return None if the statistics for the forces are not available, by default True\n            Otherwise, the statistics for the forces are set to 0.0\n\n    Returns:\n        Dictionary containing the statistics of the dataset\n    \"\"\"\n    selected_stats = self.statistics.get_results()\n    if len(selected_stats) == 0:\n        raise StatisticsNotAvailableError(self.__name__)\n    if not return_none:\n        selected_stats.update(\n            {\n                \"ForcesCalculatorStats\": {\n                    \"mean\": np.array([0.0]),\n                    \"std\": np.array([0.0]),\n                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                }\n            }\n        )\n    # cycle trough dict to convert units\n    for key, result in selected_stats.items():\n        if isinstance(result, ForcesCalculatorStats):\n            result.transform(self.convert_forces)\n        else:\n            result.transform(self.convert_energy)\n        result.transform(self._convert_array)\n    return {k: result.to_dict() for k, result in selected_stats.items()}\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_cached","title":"is_cached()","text":"

    Check if the dataset is cached locally.

    Returns:

    Type Description bool

    True if the dataset is cached locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_cached(self) -> bool:\n    \"\"\"\n    Check if the dataset is cached locally.\n\n    Returns:\n        True if the dataset is cached locally, False otherwise.\n    \"\"\"\n    predicats = [\n        os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_preprocessed","title":"is_preprocessed()","text":"

    Check if the dataset is preprocessed and available online or locally.

    Returns:

    Type Description bool

    True if the dataset is available remotely or locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_preprocessed(self) -> bool:\n    \"\"\"\n    Check if the dataset is preprocessed and available online or locally.\n\n    Returns:\n        True if the dataset is available remotely or locally, False otherwise.\n    \"\"\"\n    predicats = [\n        copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.no_init","title":"no_init() classmethod","text":"

    Class method to prevent the init method from being called when the class is instantiated. Useful for debugging purposes or preprocessing data.

    Source code in openqdc/datasets/base.py
    @classmethod\ndef no_init(cls):\n    \"\"\"\n    Class method to avoid the __init__ method to be called when the class is instanciated.\n    Useful for debugging purposes or preprocessing data.\n    \"\"\"\n    return cls.__new__(cls)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.preprocess","title":"preprocess(upload=False, overwrite=True, as_zarr=True)","text":"

    Preprocess the dataset and save it.

    Parameters:

    Name Type Description Default upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True as_zarr bool

    Whether to save the data as zarr files

    True Source code in openqdc/datasets/base.py
    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n    \"\"\"\n    Preprocess the dataset and save it.\n\n    Parameters:\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            hether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n        as_zarr:\n            Whether to save the data as zarr files\n    \"\"\"\n    if overwrite or not self.is_preprocessed():\n        entries = self.read_raw_entries()\n        res = self.collate_list(entries)\n        self.save_preprocess(res, upload, overwrite, as_zarr)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.read_raw_entries","title":"read_raw_entries()","text":"

    Preprocess the raw data (i.e. as fetched from the source) into a list of dictionaries.

    Source code in openqdc/datasets/base.py
    def read_raw_entries(self):\n    \"\"\"\n    Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_preprocess","title":"save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.

    Parameters:

    Name Type Description Default data_dict Dict[str, ndarray]

    Dictionary containing the preprocessed data.

    required upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True Source code in openqdc/datasets/base.py
    def save_preprocess(\n    self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n):\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n    Parameters:\n        data_dict:\n            Dictionary containing the preprocessed data.\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n    \"\"\"\n    # save memmaps\n    logger.info(\"Preprocessing data and saving it to cache.\")\n    paths = self.dataset_wrapper.save_preprocess(\n        self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n    )\n    if upload:\n        for local_path in paths:\n            push_remote(local_path, overwrite=overwrite)  # make it async?\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_xyz","title":"save_xyz(idx, energy_method=0, path=None, ext=True)","text":"

    Save a single entry at index idx as an extxyz file.

    Parameters:

    Name Type Description Default idx int

    Index of the entry

    required energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file. If None, the current working directory is used.

    None ext bool

    Whether to include additional information such as forces and other metadata (extxyz format)

    True Source code in openqdc/datasets/base.py
    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n    \"\"\"\n    Save a single entry at index idx as an extxyz file.\n\n    Parameters:\n        idx:\n            Index of the entry\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file. If None, the current working directory is used.\n        ext:\n            Whether to include additional informations like forces and other metadatas (extxyz format)\n    \"\"\"\n    if path is None:\n        path = os.getcwd()\n    at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n    write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_distance_unit","title":"set_distance_unit(value)","text":"

    Set a new distance unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New distance unit to set.

    required Source code in openqdc/datasets/base.py
    def set_distance_unit(self, value: str):\n    \"\"\"\n    Set a new distance unit for the dataset.\n\n    Parameters:\n        value:\n            New distance unit to set.\n    \"\"\"\n    # old_unit = self.distance_unit\n    # self.__distance_unit__ = value\n    self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n    self.__distance_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_energy_unit","title":"set_energy_unit(value)","text":"

    Set a new energy unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New energy unit to set.

    required Source code in openqdc/datasets/base.py
    def set_energy_unit(self, value: str):\n    \"\"\"\n    Set a new energy unit for the dataset.\n\n    Parameters:\n        value:\n            New energy unit to set.\n    \"\"\"\n    # old_unit = self.energy_unit\n    # self.__energy_unit__ = value\n    self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n    self.__energy_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.to_xyz","title":"to_xyz(energy_method=0, path=None)","text":"

    Save dataset as single xyz file (extended xyz format).

    Parameters:

    Name Type Description Default energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file

    None Source code in openqdc/datasets/base.py
    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n    \"\"\"\n    Save dataset as single xyz file (extended xyz format).\n\n    Parameters:\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file\n    \"\"\"\n    with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n        for atoms in tqdm(\n            self.as_iter(atoms=True, energy_method=energy_method),\n            total=len(self),\n            desc=f\"Saving {self.__name__} as xyz file\",\n        ):\n            write_extxyz(f, atoms, append=True)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.upload","title":"upload(overwrite=False, as_zarr=False)","text":"

    Upload the preprocessed data to the remote storage. Must be called after preprocess and requires write privileges.

    Parameters:

    Name Type Description Default overwrite bool

    Whether to overwrite the remote data if it already exists

    False as_zarr bool

    Whether to upload the data as zarr files

    False Source code in openqdc/datasets/base.py
    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n    \"\"\"\n    Upload the preprocessed data to the remote storage. Must be called after preprocess and\n    need to have write privileges.\n\n    Parameters:\n        overwrite:\n            Whether to overwrite the remote data if it already exists\n        as_zarr:\n            Whether to upload the data as zarr files\n    \"\"\"\n    for key in self.data_keys:\n        local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n        push_remote(local_path, overwrite=overwrite)\n    local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n    push_remote(local_path, overwrite=overwrite)\n
    "},{"location":"API/formats.html","title":"Format loading","text":""},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure","title":"GeneralStructure","text":"

    Bases: ABC

    Abstract factory class for dataset types in the openQDC package.

    Source code in openqdc/datasets/structure.py
    class GeneralStructure(ABC):\n    \"\"\"\n    Abstract Factory class for datasets type in the openQDC package.\n    \"\"\"\n\n    _ext: Optional[str] = None\n    _extra_files: Optional[List[str]] = None\n\n    @property\n    def ext(self):\n        return self._ext\n\n    @property\n    @abstractmethod\n    def load_fn(self) -> Callable:\n        \"\"\"\n        Function to use for loading the data.\n        Must be implemented by the child class.\n\n        Returns:\n            the function to use for loading the data\n        \"\"\"\n        raise NotImplementedError\n\n    def add_extension(self, filename: str) -> str:\n        \"\"\"\n        Add the correct extension to a filename\n\n        Parameters:\n            filename:  the filename to add the extension to\n\n        Returns:\n            the filename with the extension\n        \"\"\"\n        return filename + self.ext\n\n    @abstractmethod\n    def save_preprocess(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_dict: Dict[str, np.ndarray],\n        extra_data_keys: List[str],\n        extra_data_types: Dict[str, type],\n    ) -> List[str]:\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n        Must be implemented by the child class.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_dict:        dictionary of data to save\n            extra_data_keys:  list of keys to load from the extra data file\n            extra_data_types: dictionary of data types for each key\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def load_extra_files(\n        self,\n        data: Dict[str, np.ndarray],\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        pkl_data_keys: List[str],\n      
  overwrite: bool,\n    ):\n        \"\"\"\n        Load extra files required to define other types of data.\n        Must be implemented by the child class.\n\n        Parameters:\n            data:  dictionary of data to load\n            preprocess_path:  path to the preprocessed data file\n            data_keys:    list of keys to load from the data file\n            pkl_data_keys:   list of keys to load from the extra files\n            overwrite:   whether to overwrite the local cache\n        \"\"\"\n        raise NotImplementedError\n\n    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n        \"\"\"\n        Join a path and a filename and add the correct extension.\n\n        Parameters:\n            path:  the path to join\n            filename:  the filename to join\n\n        Returns:\n            the joined path with the correct extension\n        \"\"\"\n        return p_join(path, self.add_extension(filename))\n\n    def load_data(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_types: Dict[str, np.dtype],\n        data_shapes: Dict[str, Tuple[int, int]],\n        extra_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Main method to load the data from a filetype structure like memmap or zarr.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_types:       dictionary of data types for each key\n            data_shapes:      dictionary of shapes for each key\n            extra_data_keys:  list of keys to load from the extra data file\n            overwrite:        whether to overwrite the local cache\n        \"\"\"\n        data = {}\n        for key in data_keys:\n            filename = self.join_and_ext(preprocess_path, key)\n            pull_locally(filename, overwrite=overwrite)\n         
   data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n            data[key] = self.unpack(data[key])\n            data[key] = data[key].reshape(*data_shapes[key])\n\n        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n        return data\n\n    def unpack(self, data: any) -> any:\n        \"\"\"\n        Unpack the data from the loaded file.\n\n        Parameters:\n            data:  the data to unpack\n\n        Returns:\n            the unpacked data\n        \"\"\"\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_fn","title":"load_fn: Callable abstractmethod property","text":"

    Function to use for loading the data. Must be implemented by the child class.

    Returns:

    Type Description Callable

    the function to use for loading the data

    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.add_extension","title":"add_extension(filename)","text":"

    Add the correct extension to a filename

    Parameters:

    Name Type Description Default filename str

    the filename to add the extension to

    required

    Returns:

    Type Description str

    the filename with the extension

    Source code in openqdc/datasets/structure.py
    def add_extension(self, filename: str) -> str:\n    \"\"\"\n    Add the correct extension to a filename\n\n    Parameters:\n        filename:  the filename to add the extension to\n\n    Returns:\n        the filename with the extension\n    \"\"\"\n    return filename + self.ext\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.join_and_ext","title":"join_and_ext(path, filename)","text":"

    Join a path and a filename and add the correct extension.

    Parameters:

    Name Type Description Default path Union[str, PathLike]

    the path to join

    required filename str

    the filename to join

    required

    Returns:

    Type Description Union[str, PathLike]

    the joined path with the correct extension

    Source code in openqdc/datasets/structure.py
    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n    \"\"\"\n    Join a path and a filename and add the correct extension.\n\n    Parameters:\n        path:  the path to join\n        filename:  the filename to join\n\n    Returns:\n        the joined path with the correct extension\n    \"\"\"\n    return p_join(path, self.add_extension(filename))\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_data","title":"load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)","text":"

    Main method to load the data from a filetype structure like memmap or zarr.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required data_types Dict[str, dtype]

    dictionary of data types for each key

    required data_shapes Dict[str, Tuple[int, int]]

    dictionary of shapes for each key

    required extra_data_keys List[str]

    list of keys to load from the extra data file

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    def load_data(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_types: Dict[str, np.dtype],\n    data_shapes: Dict[str, Tuple[int, int]],\n    extra_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Main method to load the data from a filetype structure like memmap or zarr.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_types:       dictionary of data types for each key\n        data_shapes:      dictionary of shapes for each key\n        extra_data_keys:  list of keys to load from the extra data file\n        overwrite:        whether to overwrite the local cache\n    \"\"\"\n    data = {}\n    for key in data_keys:\n        filename = self.join_and_ext(preprocess_path, key)\n        pull_locally(filename, overwrite=overwrite)\n        data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n        data[key] = self.unpack(data[key])\n        data[key] = data[key].reshape(*data_shapes[key])\n\n    data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_extra_files","title":"load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite) abstractmethod","text":"

    Load extra files required to define other types of data. Must be implemented by the child class.

    Parameters:

    Name Type Description Default data Dict[str, ndarray]

    dictionary of data to load

    required preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required pkl_data_keys List[str]

    list of keys to load from the extra files

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef load_extra_files(\n    self,\n    data: Dict[str, np.ndarray],\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    pkl_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Load extra files required to define other types of data.\n    Must be implemented by the child class.\n\n    Parameters:\n        data:  dictionary of data to load\n        preprocess_path:  path to the preprocessed data file\n        data_keys:    list of keys to load from the data file\n        pkl_data_keys:   list of keys to load from the extra files\n        overwrite:   whether to overwrite the local cache\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.save_preprocess","title":"save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) abstractmethod","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage. Must be implemented by the child class.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required data_dict Dict[str, ndarray]

    dictionary of data to save

    required extra_data_keys List[str]

    list of keys to load from the extra data file

    required extra_data_types Dict[str, type]

    dictionary of data types for each key

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef save_preprocess(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_dict: Dict[str, np.ndarray],\n    extra_data_keys: List[str],\n    extra_data_types: Dict[str, type],\n) -> List[str]:\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n    Must be implemented by the child class.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_dict:        dictionary of data to save\n        extra_data_keys:  list of keys to load from the extra data file\n        extra_data_types: dictionary of data types for each key\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.unpack","title":"unpack(data)","text":"

    Unpack the data from the loaded file.

    Parameters:

    Name Type Description Default data any

    the data to unpack

    required

    Returns:

    Type Description any

    the unpacked data

    Source code in openqdc/datasets/structure.py
    def unpack(self, data: any) -> any:\n    \"\"\"\n    Unpack the data from the loaded file.\n\n    Parameters:\n        data:  the data to unpack\n\n    Returns:\n        the unpacked data\n    \"\"\"\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.MemMapDataset","title":"MemMapDataset","text":"

    Bases: GeneralStructure

    Dataset structure for memory-mapped numpy arrays and props.pkl files.

    Source code in openqdc/datasets/structure.py
    class MemMapDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for memory-mapped numpy arrays and props.pkl files.\n    \"\"\"\n\n    _ext = \".mmap\"\n    _extra_files = [\"props.pkl\"]\n\n    @property\n    def load_fn(self):\n        return np.memmap\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        local_paths = []\n        for key in data_keys:\n            local_path = self.join_and_ext(preprocess_path, key)\n            out = np.memmap(local_path, mode=\"w+\", dtype=data_dict[key].dtype, shape=data_dict[key].shape)\n            out[:] = data_dict.pop(key)[:]\n            out.flush()\n            local_paths.append(local_path)\n\n        # save smiles and subset\n        local_path = p_join(preprocess_path, \"props.pkl\")\n\n        # assert that (required) pkl keys are present in data_dict\n        assert all([key in data_dict.keys() for key in extra_data_keys])\n\n        # store unique and inverse indices for str-based pkl keys\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        with open(local_path, \"wb\") as f:\n            pkl.dump(data_dict, f)\n\n        local_paths.append(local_path)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = p_join(preprocess_path, \"props.pkl\")\n        pull_locally(filename, overwrite=overwrite)\n        with open(filename, \"rb\") as f:\n            tmp = pkl.load(f)\n            all_pkl_keys = set(tmp.keys()) - set(data_keys)\n            # assert required pkl_keys are present in all_pkl_keys\n            assert all([key in all_pkl_keys for key in pkl_data_keys])\n            for key in all_pkl_keys:\n                x = tmp.pop(key)\n                if len(x) == 2:\n                    data[key] = x[0][x[1]]\n     
           else:\n                    data[key] = x\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.ZarrDataset","title":"ZarrDataset","text":"

    Bases: GeneralStructure

    Dataset structure for zarr files.

    Source code in openqdc/datasets/structure.py
    class ZarrDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for zarr files.\n    \"\"\"\n\n    _ext = \".zip\"\n    _extra_files = [\"metadata.zip\"]\n    _zarr_version = 2\n\n    @property\n    def load_fn(self):\n        return zarr.open\n\n    def unpack(self, data):\n        return data[:]\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        # os.makedirs(p_join(ds.root, \"zips\",  ds.__name__), exist_ok=True)\n        local_paths = []\n        for key, value in data_dict.items():\n            if key not in data_keys:\n                continue\n            zarr_path = self.join_and_ext(preprocess_path, key)\n            value = data_dict.pop(key)\n            z = zarr.open(\n                zarr.storage.ZipStore(zarr_path),\n                \"w\",\n                zarr_version=self._zarr_version,\n                shape=value.shape,\n                dtype=value.dtype,\n            )\n            z[:] = value[:]\n            local_paths.append(zarr_path)\n            # if key in attrs:\n            #    z.attrs.update(attrs[key])\n\n        metadata = p_join(preprocess_path, \"metadata.zip\")\n\n        group = zarr.group(zarr.storage.ZipStore(metadata))\n\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        for key, value in data_dict.items():\n            # sub=group.create_group(key)\n            if key in [\"name\", \"subset\"]:\n                data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)\n                data[:] = value[0][:]\n                data2 = group.create_dataset(key + \"_ptr\", shape=value[1].shape, dtype=np.int32)\n                data2[:] = value[1][:]\n            else:\n                data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)\n                data[:] = value[:]\n        
local_paths.append(metadata)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = self.join_and_ext(preprocess_path, \"metadata\")\n        pull_locally(filename, overwrite=overwrite)\n        tmp = self.load_fn(filename)\n        all_pkl_keys = set(tmp.keys()) - set(data_keys)\n        # assert required pkl_keys are present in all_pkl_keys\n        assert all([key in all_pkl_keys for key in pkl_data_keys])\n        for key in all_pkl_keys:\n            if key not in pkl_data_keys:\n                data[key] = tmp[key][:][tmp[key][:]]\n            else:\n                data[key] = tmp[key][:]\n        return data\n
    "},{"location":"API/methods.html","title":"QM Methods","text":""},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod","title":"InteractionMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class InteractionMethod(QmMethod):\n    CCSD_T_NN = Functional.CCSDT, BasisSet.NN\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    DCCSDT_HA_DZ = Functional.DCCSDT, BasisSet.HA_DZ\n    DCCSDT_HA_TZ = Functional.DCCSDT, BasisSet.HA_TZ\n    DLPNO_CCSDT = Functional.DLPNO_CCSDT, BasisSet.NONE\n    DLPNO_CCSDT0 = (\n        Functional.DLPNO_CCSDT0,\n        BasisSet.NONE,\n    )\n    FN_DMC = Functional.FN_DMC, BasisSet.NONE\n    FIXED = Functional.FIXED, BasisSet.NONE\n    LNO_CCSDT = Functional.LNO_CCSDT, BasisSet.NONE\n    MP2_CBS = Functional.MP2, BasisSet.CBS\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MP2_5_CBS_ADZ = Functional.MP2_5, BasisSet.CBS_ADZ\n    MP2C_CBS = Functional.MP2C, BasisSet.CBS\n    QCISDT_CBS = Functional.QCISDT, BasisSet.CBS\n    SAPT0_AUG_CC_PWCVXZ = Functional.SAPT0, BasisSet.AUG_CC_PWCVXZ\n    SAPT0_JUN_CC_PVDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDZ\n    SAPT0_JUN_CC_PVDDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDDZ\n    SAPT0_AUG_CC_PVDDZ = Functional.SAPT0, BasisSet.AUG_CC_PVDDZ\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get an empty atomization energy dictionary because Interaction methods don't require this\"\"\"\n        return {}\n
    "},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get an empty atomization energy dictionary because Interaction methods don't require this

    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod","title":"PotentialMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1\n    B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP\n    B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ\n    B1LYP_VWN5_TZP = Functional.B1LYP_VWN5, BasisSet.TZP\n    B1PW91_VWN5_DZP = Functional.B1PW91_VWN5, BasisSet.DZP\n    B1PW91_VWN5_SZ = Functional.B1PW91_VWN5, BasisSet.SZ\n    B1PW91_VWN5_TZP = Functional.B1PW91_VWN5, BasisSet.TZP\n    B3LYP_STO3G = Functional.B3LYP, BasisSet.STO3G  # TODO: calculate e0s\n    B3LYP_VWN5_DZP = Functional.B3LYP_VWN5, BasisSet.DZP\n    B3LYP_VWN5_SZ = Functional.B3LYP_VWN5, BasisSet.SZ\n    B3LYP_VWN5_TZP = Functional.B3LYP_VWN5, BasisSet.TZP\n    B3LYP_S_VWN5_DZP = Functional.B3LYP_S_VWN5, BasisSet.DZP\n    B3LYP_S_VWN5_SZ = Functional.B3LYP_S_VWN5, BasisSet.SZ\n    B3LYP_S_VWN5_TZP = Functional.B3LYP_S_VWN5, BasisSet.TZP\n    B3LYP_D_DZP = Functional.B3LYPD, BasisSet.DZP\n    B3LYP_D_SZ = Functional.B3LYPD, BasisSet.SZ\n    B3LYP_D_TZP = Functional.B3LYPD, BasisSet.TZP\n    B3LYP_D3_BJ_DEF2_TZVP = Functional.B3LYP_D3_BJ, BasisSet.DEF2_TZVP\n    B3LYP_6_31G_D = Functional.B3LYP, BasisSet.GSTAR\n    B3LYP_DEF2_TZVP = Functional.B3LYP, BasisSet.DEF2_TZVP\n    B97_1_DZP = Functional.B97_1, BasisSet.DZP\n    B97_1_SZ = Functional.B97_1, BasisSet.SZ\n    B97_1_TZP = Functional.B97_1, BasisSet.TZP\n    B97_2_DZP = Functional.B97_2, BasisSet.DZP\n    B97_2_SZ = Functional.B97_2, BasisSet.SZ\n    B97_2_TZP = Functional.B97_2, BasisSet.TZP\n    B97_D_DZP = Functional.B97_D, BasisSet.DZP\n    B97_D_SZ = Functional.B97_D, BasisSet.SZ\n    B97_D_TZP = Functional.B97_D, BasisSet.TZP\n    B97_DZP = Functional.B97, BasisSet.DZP\n    B97_SZ = Functional.B97, BasisSet.SZ\n    B97_TZP = Functional.B97, BasisSet.TZP\n    BECKE00_X_ONLY_DZP = Functional.BECKE00_X_ONLY, BasisSet.DZP\n    BECKE00_X_ONLY_SZ = Functional.BECKE00_X_ONLY, BasisSet.SZ\n    BECKE00_X_ONLY_TZP = Functional.BECKE00_X_ONLY, BasisSet.TZP\n    BECKE00_DZP = Functional.BECKE00, 
BasisSet.DZP\n    BECKE00_SZ = Functional.BECKE00, BasisSet.SZ\n    BECKE00_TZP = Functional.BECKE00, BasisSet.TZP\n    BECKE00X_XC_DZP = Functional.BECKE00X_XC, BasisSet.DZP\n    BECKE00X_XC_SZ = Functional.BECKE00X_XC, BasisSet.SZ\n    BECKE00X_XC_TZP = Functional.BECKE00X_XC, BasisSet.TZP\n    BECKE88X_BR89C_DZP = Functional.BECKE88X_BR89C, BasisSet.DZP\n    BECKE88X_BR89C_SZ = Functional.BECKE88X_BR89C, BasisSet.SZ\n    BECKE88X_BR89C_TZP = Functional.BECKE88X_BR89C, BasisSet.TZP\n    BHANDH_DZP = Functional.BHANDH, BasisSet.DZP\n    BHANDH_SZ = Functional.BHANDH, BasisSet.SZ\n    BHANDH_TZP = Functional.BHANDH, BasisSet.TZP\n    BHANDHLYP_DZP = Functional.BHANDHLYP, BasisSet.DZP\n    BHANDHLYP_SZ = Functional.BHANDHLYP, BasisSet.SZ\n    BHANDHLYP_TZP = Functional.BHANDHLYP, BasisSet.TZP\n    BLAP3_DZP = Functional.BLAP3, BasisSet.DZP\n    BLAP3_SZ = Functional.BLAP3, BasisSet.SZ\n    BLAP3_TZP = Functional.BLAP3, BasisSet.TZP\n    BLYP_D_DZP = Functional.BLYPD, BasisSet.DZP\n    BLYP_D_SZ = Functional.BLYPD, BasisSet.SZ\n    BLYP_D_TZP = Functional.BLYPD, BasisSet.TZP\n    BLYP_DZP = Functional.BLYP, BasisSet.DZP\n    BLYP_SZ = Functional.BLYP, BasisSet.SZ\n    BLYP_TZP = Functional.BLYP, BasisSet.TZP\n    BMTAU1_DZP = Functional.BMTAU1, BasisSet.DZP\n    BMTAU1_SZ = Functional.BMTAU1, BasisSet.SZ\n    BMTAU1_TZP = Functional.BMTAU1, BasisSet.TZP\n    BOP_DZP = Functional.BOP, BasisSet.DZP\n    BOP_SZ = Functional.BOP, BasisSet.SZ\n    BOP_TZP = Functional.BOP, BasisSet.TZP\n    BP_DZP = Functional.BP, BasisSet.DZP\n    BP_SZ = Functional.BP, BasisSet.SZ\n    BP_TZP = Functional.BP, BasisSet.TZP\n    BP86_D_DZP = Functional.BP86_D, BasisSet.DZP\n    BP86_D_SZ = Functional.BP86_D, BasisSet.SZ\n    BP86_D_TZP = Functional.BP86_D, BasisSet.TZP\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ\n\n    
DFT3B = Functional.DFT3B, BasisSet.NONE\n    DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP\n    FT97_DZP = Functional.FT97, BasisSet.DZP\n    FT97_SZ = Functional.FT97, BasisSet.SZ\n    FT97_TZP = Functional.FT97, BasisSet.TZP\n    GFN1_XTB = Functional.GFN1_XTB, BasisSet.NONE\n    GFN2_XTB = Functional.GFN2_XTB, BasisSet.NONE\n    HCTH_120_DZP = Functional.HCTH_120, BasisSet.DZP\n    HCTH_120_SZ = Functional.HCTH_120, BasisSet.SZ\n    HCTH_120_TZP = Functional.HCTH_120, BasisSet.TZP\n    HCTH_147_DZP = Functional.HCTH_147, BasisSet.DZP\n    HCTH_147_SZ = Functional.HCTH_147, BasisSet.SZ\n    HCTH_147_TZP = Functional.HCTH_147, BasisSet.TZP\n    HCTH_407_DZP = Functional.HCTH_407, BasisSet.DZP\n    HCTH_407_SZ = Functional.HCTH_407, BasisSet.SZ\n    HCTH_407_TZP = Functional.HCTH_407, BasisSet.TZP\n    HCTH_93_DZP = Functional.HCTH_93, BasisSet.DZP\n    HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ\n    HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP\n    HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP\n    HF_CC_PVDZ = (\n        Functional.HF,\n        BasisSet.CC_PVDZ,\n    )\n    HF_CC_PVQZ = (\n        Functional.HF,\n        BasisSet.CC_PVQZ,\n    )\n    HF_CC_PVTZ = (\n        Functional.HF,\n        BasisSet.CC_PVTZ,\n    )\n    KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP\n    KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ\n    KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP\n    KCIS_ORIGINAL_DZP = Functional.KCIS_ORIGINAL, BasisSet.DZP\n    KCIS_ORIGINAL_SZ = Functional.KCIS_ORIGINAL, BasisSet.SZ\n    KCIS_ORIGINAL_TZP = Functional.KCIS_ORIGINAL, BasisSet.TZP\n    KMLYP_VWN5_DZP = Functional.KMLYP_VWN5, BasisSet.DZP\n    KMLYP_VWN5_SZ = Functional.KMLYP_VWN5, BasisSet.SZ\n    KMLYP_VWN5_TZP = Functional.KMLYP_VWN5, BasisSet.TZP\n    KT1_DZP = Functional.KT1, BasisSet.DZP\n    KT1_SZ = Functional.KT1, BasisSet.SZ\n    KT1_TZP = Functional.KT1, BasisSet.TZP\n    KT2_DZP = Functional.KT2, 
BasisSet.DZP\n    KT2_SZ = Functional.KT2, BasisSet.SZ\n    KT2_TZP = Functional.KT2, BasisSet.TZP\n    LDA_VWN_DZP = Functional.LDA_VWN, BasisSet.DZP\n    LDA_VWN_SZ = Functional.LDA_VWN, BasisSet.SZ\n    LDA_VWN_TZP = Functional.LDA_VWN, BasisSet.TZP\n    M05_2X_DZP = Functional.M05_2X, BasisSet.DZP\n    M05_2X_SZ = Functional.M05_2X, BasisSet.SZ\n    M05_2X_TZP = Functional.M05_2X, BasisSet.TZP\n    M05_DZP = Functional.M05, BasisSet.DZP\n    M05_SZ = Functional.M05, BasisSet.SZ\n    M05_TZP = Functional.M05, BasisSet.TZP\n    M06_2X_DZP = Functional.M06_2X, BasisSet.DZP\n    M06_2X_SZ = Functional.M06_2X, BasisSet.SZ\n    M06_2X_TZP = Functional.M06_2X, BasisSet.TZP\n    M06_L_DZP = Functional.M06_L, BasisSet.DZP\n    M06_L_SZ = Functional.M06_L, BasisSet.SZ\n    M06_L_TZP = Functional.M06_L, BasisSet.TZP\n    M06_DZP = Functional.M06, BasisSet.DZP\n    M06_SZ = Functional.M06, BasisSet.SZ\n    M06_TZP = Functional.M06, BasisSet.TZP\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MPBE_DZP = Functional.MPBE, BasisSet.DZP\n    MPBE_SZ = Functional.MPBE, BasisSet.SZ\n    MPBE_TZP = Functional.MPBE, BasisSet.TZP\n    MPBE0KCIS_DZP = Functional.MPBE0KCIS, BasisSet.DZP\n    MPBE0KCIS_SZ = Functional.MPBE0KCIS, BasisSet.SZ\n    MPBE0KCIS_TZP = Functional.MPBE0KCIS, BasisSet.TZP\n    MPBE1KCIS_DZP = Functional.MPBE1KCIS, BasisSet.DZP\n    MPBE1KCIS_SZ = Functional.MPBE1KCIS, BasisSet.SZ\n    MPBE1KCIS_TZP = Functional.MPBE1KCIS, BasisSet.TZP\n    MPBEKCIS_DZP = Functional.MPBEKCIS, BasisSet.DZP\n    MPBEKCIS_SZ = Functional.MPBEKCIS, BasisSet.SZ\n    MPBEKCIS_TZP = Functional.MPBEKCIS, BasisSet.TZP\n    MPW_DZP = Functional.MPW, BasisSet.DZP\n    MPW_SZ = Functional.MPW, BasisSet.SZ\n    MPW_TZP = Functional.MPW, BasisSet.TZP\n    MPW1K_DZP = Functional.MPW1K, BasisSet.DZP\n    MPW1K_SZ = Functional.MPW1K, BasisSet.SZ\n    MPW1K_TZP = Functional.MPW1K, 
BasisSet.TZP\n    MPW1PW_DZP = Functional.MPW1PW, BasisSet.DZP\n    MPW1PW_SZ = Functional.MPW1PW, BasisSet.SZ\n    MPW1PW_TZP = Functional.MPW1PW, BasisSet.TZP\n    MVS_DZP = Functional.MVS, BasisSet.DZP\n    MVS_SZ = Functional.MVS, BasisSet.SZ\n    MVS_TZP = Functional.MVS, BasisSet.TZP\n    MVSX_DZP = Functional.MVSX, BasisSet.DZP\n    MVSX_SZ = Functional.MVSX, BasisSet.SZ\n    MVSX_TZP = Functional.MVSX, BasisSet.TZP\n    O3LYP_VWN5_DZP = Functional.O3LYP_VWN5, BasisSet.DZP\n    O3LYP_VWN5_SZ = Functional.O3LYP_VWN5, BasisSet.SZ\n    O3LYP_VWN5_TZP = Functional.O3LYP_VWN5, BasisSet.TZP\n    OLAP3_DZP = Functional.OLAP3, BasisSet.DZP\n    OLAP3_SZ = Functional.OLAP3, BasisSet.SZ\n    OLAP3_TZP = Functional.OLAP3, BasisSet.TZP\n    OLYP_DZP = Functional.OLYP, BasisSet.DZP\n    OLYP_SZ = Functional.OLYP, BasisSet.SZ\n    OLYP_TZP = Functional.OLYP, BasisSet.TZP\n    OPBE_DZP = Functional.OPBE, BasisSet.DZP\n    OPBE_SZ = Functional.OPBE, BasisSet.SZ\n    OPBE_TZP = Functional.OPBE, BasisSet.TZP\n    OPBE0_DZP = Functional.OPBE0, BasisSet.DZP\n    OPBE0_SZ = Functional.OPBE0, BasisSet.SZ\n    OPBE0_TZP = Functional.OPBE0, BasisSet.TZP\n    OPERDEW_DZP = Functional.OPERDEW, BasisSet.DZP\n    OPERDEW_SZ = Functional.OPERDEW, BasisSet.SZ\n    OPERDEW_TZP = Functional.OPERDEW, BasisSet.TZP\n    PBE_D_DZP = Functional.PBE_D, BasisSet.DZP\n    PBE_D_SZ = Functional.PBE_D, BasisSet.SZ\n    PBE_D_TZP = Functional.PBE_D, BasisSet.TZP\n    PBE_D3_BJ_DEF2_TZVP = Functional.PBE_D3_BJ, BasisSet.DEF2_TZVP\n    PBE_DEF2_TZVP = Functional.PBE, BasisSet.DEF2_TZVP\n    PBE_DZP = Functional.PBE, BasisSet.DZP\n    PBE_SZ = Functional.PBE, BasisSet.SZ\n    PBE_TZP = Functional.PBE, BasisSet.TZP\n    PBE0_DZP = Functional.PBE0, BasisSet.DZP\n    PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP\n    PBE0_SZ = Functional.PBE0, BasisSet.SZ\n    PBE0_TZP = Functional.PBE0, BasisSet.TZP\n    PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD\n    PBESOL_DZP = 
Functional.PBESOL, BasisSet.DZP\n    PBESOL_SZ = Functional.PBESOL, BasisSet.SZ\n    PBESOL_TZP = Functional.PBESOL, BasisSet.TZP\n    PKZB_DZP = Functional.PKZB, BasisSet.DZP\n    PKZB_SZ = Functional.PKZB, BasisSet.SZ\n    PKZB_TZP = Functional.PKZB, BasisSet.TZP\n    PKZBX_KCISCOR_DZP = Functional.PKZBX_KCISCOR, BasisSet.DZP\n    PKZBX_KCISCOR_SZ = Functional.PKZBX_KCISCOR, BasisSet.SZ\n    PKZBX_KCISCOR_TZP = Functional.PKZBX_KCISCOR, BasisSet.TZP\n    PM6 = Functional.PM6, BasisSet.NONE\n    PW91_DZP = Functional.PW91, BasisSet.DZP\n    PW91_SZ = Functional.PW91, BasisSet.SZ\n    PW91_TZP = Functional.PW91, BasisSet.TZP\n    REVPBE_D3_BJ_DEF2_TZVP = Functional.REVPBE_D3_BJ, BasisSet.DEF2_TZVP\n    REVPBE_DZP = Functional.REVPBE, BasisSet.DZP\n    REVPBE_SZ = Functional.REVPBE, BasisSet.SZ\n    REVPBE_TZP = Functional.REVPBE, BasisSet.TZP\n    REVTPSS_DZP = Functional.REVTPSS, BasisSet.DZP\n    REVTPSS_SZ = Functional.REVTPSS, BasisSet.SZ\n    REVTPSS_TZP = Functional.REVTPSS, BasisSet.TZP\n    RGE2_DZP = Functional.RGE2, BasisSet.DZP\n    RGE2_SZ = Functional.RGE2, BasisSet.SZ\n    RGE2_TZP = Functional.RGE2, BasisSet.TZP\n    RPBE_DZP = Functional.RPBE, BasisSet.DZP\n    RPBE_SZ = Functional.RPBE, BasisSet.SZ\n    RPBE_TZP = Functional.RPBE, BasisSet.TZP\n    SSB_D_DZP = Functional.SSB_D, BasisSet.DZP\n    SSB_D_SZ = Functional.SSB_D, BasisSet.SZ\n    SSB_D_TZP = Functional.SSB_D, BasisSet.TZP\n    SVWN_DEF2_TZVP = Functional.SVWN, BasisSet.DEF2_TZVP\n    TMGGA_DZP = Functional.TMGGA, BasisSet.DZP\n    TMGGA_SZ = Functional.TMGGA, BasisSet.SZ\n    TMGGA_TZP = Functional.TMGGA, BasisSet.TZP\n    TAU_HCTH_HYBRID_DZP = Functional.TAU_HCTH_HYBRID, BasisSet.DZP\n    TAU_HCTH_HYBRID_SZ = Functional.TAU_HCTH_HYBRID, BasisSet.SZ\n    TAU_HCTH_HYBRID_TZP = Functional.TAU_HCTH_HYBRID, BasisSet.TZP\n    TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP\n    TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ\n    TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP\n    
TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ\n    TPSSD_DZP = Functional.TPSSD, BasisSet.DZP\n    TPSSD_SZ = Functional.TPSSD, BasisSet.SZ\n    TPSSD_TZP = Functional.TPSSD, BasisSet.TZP\n    TPSS_DZP = Functional.TPSS, BasisSet.DZP\n    TPSS_SZ = Functional.TPSS, BasisSet.SZ\n    TPSS_TZP = Functional.TPSS, BasisSet.TZP\n    TPSSH_DEF2_TZVP = Functional.TPSSH, BasisSet.DEF2_TZVP\n    TPSSH_DZP = Functional.TPSSH, BasisSet.DZP\n    TPSSH_SZ = Functional.TPSSH, BasisSet.SZ\n    TPSSH_TZP = Functional.TPSSH, BasisSet.TZP\n    TTM2_1_F = Functional.TTM2_1_F, BasisSet.NONE\n    VS98_X_XC_DZP = Functional.VS98_X_XC, BasisSet.DZP\n    VS98_X_XC_SZ = Functional.VS98_X_XC, BasisSet.SZ\n    VS98_X_XC_TZP = Functional.VS98_X_XC, BasisSet.TZP\n    VS98_X_ONLY_DZP = Functional.VS98_X_ONLY, BasisSet.DZP\n    VS98_X_ONLY_SZ = Functional.VS98_X_ONLY, BasisSet.SZ\n    VS98_X_ONLY_TZP = Functional.VS98_X_ONLY, BasisSet.TZP\n    VS98_DZP = Functional.VS98, BasisSet.DZP\n    VS98_SZ = Functional.VS98, BasisSet.SZ\n    VS98_TZP = Functional.VS98, BasisSet.TZP\n    WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD\n    WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP\n    WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP\n    WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ\n    WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR\n    WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ\n    X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, BasisSet.DZP\n    X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ\n    X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP\n    XLYP_DZP = Functional.XLYP, BasisSet.DZP\n    XLYP_SZ = Functional.XLYP, BasisSet.SZ\n    XLYP_TZP = Functional.XLYP, BasisSet.TZP\n    NONE = Functional.NONE, BasisSet.NONE\n\n    def _build_default_dict(self):\n        e0_dict = {}\n        for SYMBOL in ATOM_SYMBOLS:\n            for CHARGE in range(-10, 11):\n                e0_dict[(SYMBOL, CHARGE)] = array([0], 
dtype=float32)\n        return e0_dict\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        key = str(self)\n        try:\n            # print(key)\n            energies = atom_energy_collection.get(key, {})\n            if len(energies) == 0:\n                raise\n        except:  # noqa\n            logger.info(f\"No available atomization energy for the QM method {key}. All values are set to 0.\")\n            energies = self._build_default_dict()\n        return energies\n
    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod","title":"QmMethod","text":"

    Bases: Enum

    Source code in openqdc/methods/enums.py
    class QmMethod(Enum):\n    def __init__(self, functional: Functional, basis_set: BasisSet, cost: float = 0):\n        self.functional = functional\n        self.basis_set = basis_set\n        self.cost = cost\n\n    def __str__(self):\n        if self.basis_set != \"\":\n            s = \"/\".join([str(self.functional), str(self.basis_set)])\n        else:\n            s = str(self.functional)\n        return s\n\n    @property\n    def atom_energies_matrix(self):\n        \"\"\"Get the atomization energy matrix\"\"\"\n        energies = self.atom_energies_dict\n        mat = to_e_matrix(energies)\n\n        return mat\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        raise NotImplementedError()\n
    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_matrix","title":"atom_energies_matrix property","text":"

    Get the atomization energy matrix

    "},{"location":"API/methods.html#isolated-atom-energies","title":"Isolated Atom Energies","text":""},{"location":"API/methods.html#openqdc.methods.atom_energies.to_e_matrix","title":"to_e_matrix(atom_energies)","text":"

    Get the matrix of isolated atom energies for a dict of non-null values calculates

    Parameters:

    Name Type Description Default atom_energies Dict

    Dict of energies computed for a given QM method. Keys are pairs of (atom, charge) and values are energy values

    required

    np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)

    Type Description ndarray

    Matrix containing the isolated atom energies for each atom and charge written in the form:

            |   | -2 | -1 | 0 | +1 | +2 | <- charges\n        |---|----|----|---|----|----|\n        | 0 |    |    |   |    |    |\n        | 1 |    |    |   |    |    |\n        | 2 |    |    |   |    |    |\n
    Source code in openqdc/methods/atom_energies.py
    def to_e_matrix(atom_energies: Dict) -> np.ndarray:\n    \"\"\"\n    Get the matrix of isolated atom energies for a dict of non-null values calculates\n\n    Parameters:\n        atom_energies: Dict of energies computed for a given QM method.\n            Keys are pairs of (atom, charge) and values are energy values\n\n    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)\n        Matrix containing the isolated atom energies for each atom and charge written in the form:\n\n                        |   | -2 | -1 | 0 | +1 | +2 | <- charges\n                        |---|----|----|---|----|----|\n                        | 0 |    |    |   |    |    |\n                        | 1 |    |    |   |    |    |\n                        | 2 |    |    |   |    |    |\n    \"\"\"\n\n    matrix = np.zeros((MAX_ATOMIC_NUMBER, MAX_CHARGE_NUMBER))\n    if len(atom_energies) > 0:\n        for key in atom_energies.keys():\n            try:\n                matrix[ATOMIC_NUMBERS[key[0]], key[1] + MAX_CHARGE] = atom_energies[key]\n            except KeyError:\n                logger.error(f\"Isolated atom energies not found for {key}\")\n    return matrix\n
    "},{"location":"API/regressor.html","title":"Normalization regressor","text":"

    Linear Atom Energies regression utilities.

    "},{"location":"API/regressor.html#openqdc.utils.regressor.LinearSolver","title":"LinearSolver","text":"

    Bases: Solver

    Linear regression solver.

    Note

    No Uncertainty associated as it is quite small.

    Source code in openqdc/utils/regressor.py
    class LinearSolver(Solver):\n    \"\"\"\n    Linear regression solver.\n\n    Note:\n        No Uncertainty associated as it is quite small.\n    \"\"\"\n\n    _regr_str = \"linear\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        E0s = np.linalg.lstsq(X, y, rcond=None)[0]\n        return E0s, None\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor","title":"Regressor","text":"

    Regressor class for preparing and solving regression problem for isolated atom energies. An isolated atom energy regression problem is defined as:

    X = [n_samples, n_species] (number of atoms of each species per sample)

    Y = [n_samples, ] (energies)

    The regression problem is solved by solving the linear system X E0 = Y.

    Example

    For a system of 2 samples (H2O, CH4)

    n_species = 3, n_samples = 2\n\nH2O = 2H, 1O -> X = [2, 1, 0]\n\nCH4 = 4H, 1C -> X = [4, 0, 1]\n\nX = [[2, 1, 0],\n    [ 4, 0, 1]]\n\nY = [[10, 20]]\n\nX E0 = Y\n

    Linear system to solve

    [[2 eH, 1 eO, 0 eC],\n[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]\n
    Source code in openqdc/utils/regressor.py
    class Regressor:\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n    A isolated atom energy regression problem is defined as:\\n\n    X = [n_samples, n_species] (number of atoms of each species per sample)\\n\n    Y = [n_samples, ] (energies)\\n\n    The regression problem is solved by solving the linear system X E0 = Y.\n\n    Example:\n        For a sytem of 2 samples (H20, CH4)\\n\n            n_species = 3, n_samples = 2\\n\n            H20 = 2H , 1O -> X = [2, 1, 0]\\n\n            CH4 = 4C, 1H -> X = [1, 0, 4]\\n\n            X = [[2, 1, 0],\n                [ 1, 0, 4]]\\n\n            Y = [[10, 20]]\\n\n            X E0 = Y\\n\n        Linear system to solve\\n\n            [[2 eH, 1 eO, 0 eC],\n            [ 1 eH, 0 eO, 4 eC]] = [[10, 20]]\n    \"\"\"\n\n    solver: Solver\n\n    def __init__(\n        self,\n        energies: np.ndarray,\n        atomic_numbers: np.ndarray,\n        position_idx_range: np.ndarray,\n        solver_type: str = \"linear\",\n        stride: int = 1,\n        subsample: Optional[Union[float, int]] = None,\n        remove_nan: bool = True,\n        *args: any,\n        **kwargs: any,\n    ):\n        \"\"\"\n        Regressor class for preparing and solving regression problem for isolated atom energies.\n\n        Parameters:\n            energies:\n                numpy array of energies in the shape (n_samples, n_energy_methods)\n            atomic_numbers:\n                numpy array of atomic numbers in the shape (n_atoms,)\n            position_idx_range:\n                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n            solver_type: Type of solver to use. 
[\"linear\", \"ridge\"]\n            stride: Stride to use for the regression.\n            subsample: Sumsample the dataset.\n                If a float, it is interpreted as a fraction of the dataset to use.\n                If >1 it is interpreted as the number of samples to use.\n            remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n        \"\"\"\n        self.subsample = subsample\n        self.stride = stride\n        self.solver_type = solver_type.lower()\n        self.energies = energies\n        self.atomic_numbers = atomic_numbers\n        self.numbers = pd.unique(atomic_numbers)\n        self.position_idx_range = position_idx_range\n        self.remove_nan = remove_nan\n        self.hparams = {\n            \"subsample\": subsample,\n            \"stride\": stride,\n            \"solver_type\": solver_type,\n        }\n        self._post_init()\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n        \"\"\"\n        Initialize the regressor object from an openqdc dataset. 
This is the default method.\n        *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n        Parameters:\n            dataset: openqdc dataset object.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n\n        Returns:\n            Instance of the regressor class.\n        \"\"\"\n        energies = dataset.data[\"energies\"]\n        position_idx_range = dataset.data[\"position_idx_range\"]\n        atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n        return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n\n    def _post_init(self):\n        if self.subsample is not None:\n            self._downsample()\n        self._prepare_inputs()\n        self.solver = self._get_solver()\n\n    def update_hparams(self, hparams):\n        self.hparams.update(hparams)\n\n    def _downsample(self):\n        if self.subsample < 1:\n            idxs = np.arange(self.energies.shape[0])\n            np.random.shuffle(idxs)\n            idxs = idxs[: int(self.energies.shape[0] * self.subsample)]\n            self.energies = self.energies[:: int(1 / self.subsample)]\n            self.position_idx_range = self.position_idx_range[:: int(1 / self.subsample)]\n        else:\n            idxs = np.random.randint(0, self.energies.shape[0], int(self.subsample))\n            self.energies = self.energies[idxs]\n            self.position_idx_range = self.position_idx_range[idxs]\n        self.update_hparams({\"idxs\": idxs})\n\n    def _get_solver(self):\n        try:\n            return AVAILABLE_SOLVERS[self.solver_type]()\n        except KeyError:\n            logger.warning(f\"Unknown solver type {self.solver_type}, defaulting to linear regression.\")\n            return LinearSolver()\n\n    def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:\n        logger.info(\"Preparing 
inputs for regression.\")\n        len_train = self.energies.shape[0]\n        len_zs = len(self.numbers)\n        A = np.zeros((len_train, len_zs))[:: self.stride]\n        B = self.energies[:: self.stride]\n        for i, ij in enumerate(self.position_idx_range[:: self.stride]):\n            tmp = self.atomic_numbers[ij[0] : ij[1]]\n            for j, z in enumerate(self.numbers):\n                A[i, j] = np.count_nonzero(tmp == z)\n        self.X = A\n        self.y = B\n\n    def solve(self):\n        \"\"\"\n        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n        \"\"\"\n        logger.info(f\"Solving regression with {self.solver}.\")\n        E0_list, cov_list = [], []\n        for energy_idx in range(self.y.shape[1]):\n            if self.remove_nan:\n                idxs = non_nan_idxs(self.y[:, energy_idx])\n                X, y = self.X[idxs], self.y[idxs, energy_idx]\n            else:\n                X, y = self.X, self.y[:, energy_idx]\n            E0s, cov = self.solver(X, y)\n            if cov is None:\n                cov = np.zeros_like(E0s) + 1.0\n            E0_list.append(E0s)\n            cov_list.append(cov)\n        return np.vstack(E0_list).T, np.vstack(cov_list).T\n\n    def __call__(self):\n        return self.solve()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.__init__","title":"__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)","text":"

    Regressor class for preparing and solving regression problem for isolated atom energies.

    Parameters:

    Name Type Description Default energies ndarray

    numpy array of energies in the shape (n_samples, n_energy_methods)

    required atomic_numbers ndarray

    numpy array of atomic numbers in the shape (n_atoms,)

    required position_idx_range ndarray

    array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset

    required solver_type str

    Type of solver to use. [\"linear\", \"ridge\"]

    'linear' stride int

    Stride to use for the regression.

    1 subsample Optional[Union[float, int]]

    Subsample the dataset. If a float, it is interpreted as a fraction of the dataset to use. If >1 it is interpreted as the number of samples to use.

    None remove_nan bool

    Sanitize the dataset by removing energies samples with NaN values.

    True *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {} Source code in openqdc/utils/regressor.py
    def __init__(\n    self,\n    energies: np.ndarray,\n    atomic_numbers: np.ndarray,\n    position_idx_range: np.ndarray,\n    solver_type: str = \"linear\",\n    stride: int = 1,\n    subsample: Optional[Union[float, int]] = None,\n    remove_nan: bool = True,\n    *args: any,\n    **kwargs: any,\n):\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n\n    Parameters:\n        energies:\n            numpy array of energies in the shape (n_samples, n_energy_methods)\n        atomic_numbers:\n            numpy array of atomic numbers in the shape (n_atoms,)\n        position_idx_range:\n            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n        solver_type: Type of solver to use. [\"linear\", \"ridge\"]\n        stride: Stride to use for the regression.\n        subsample: Sumsample the dataset.\n            If a float, it is interpreted as a fraction of the dataset to use.\n            If >1 it is interpreted as the number of samples to use.\n        remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n    \"\"\"\n    self.subsample = subsample\n    self.stride = stride\n    self.solver_type = solver_type.lower()\n    self.energies = energies\n    self.atomic_numbers = atomic_numbers\n    self.numbers = pd.unique(atomic_numbers)\n    self.position_idx_range = position_idx_range\n    self.remove_nan = remove_nan\n    self.hparams = {\n        \"subsample\": subsample,\n        \"stride\": stride,\n        \"solver_type\": solver_type,\n    }\n    self._post_init()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, *args, **kwargs) classmethod","text":"

    Initialize the regressor object from an openqdc dataset. This is the default method. *args and **kwargs are passed to the __init__ method and depend on the specific regressor.

    Parameters:

    Name Type Description Default dataset any

    openqdc dataset object.

    required *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {}

    Returns:

    Type Description Regressor

    Instance of the regressor class.

    Source code in openqdc/utils/regressor.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n    \"\"\"\n    Initialize the regressor object from an openqdc dataset. This is the default method.\n    *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n    Parameters:\n        dataset: openqdc dataset object.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n\n    Returns:\n        Instance of the regressor class.\n    \"\"\"\n    energies = dataset.data[\"energies\"]\n    position_idx_range = dataset.data[\"position_idx_range\"]\n    atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n    return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.solve","title":"solve()","text":"

    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    def solve(self):\n    \"\"\"\n    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n    \"\"\"\n    logger.info(f\"Solving regression with {self.solver}.\")\n    E0_list, cov_list = [], []\n    for energy_idx in range(self.y.shape[1]):\n        if self.remove_nan:\n            idxs = non_nan_idxs(self.y[:, energy_idx])\n            X, y = self.X[idxs], self.y[idxs, energy_idx]\n        else:\n            X, y = self.X, self.y[:, energy_idx]\n        E0s, cov = self.solver(X, y)\n        if cov is None:\n            cov = np.zeros_like(E0s) + 1.0\n        E0_list.append(E0s)\n        cov_list.append(cov)\n    return np.vstack(E0_list).T, np.vstack(cov_list).T\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.RidgeSolver","title":"RidgeSolver","text":"

    Bases: Solver

    Ridge regression solver.

    Source code in openqdc/utils/regressor.py
    class RidgeSolver(Solver):\n    \"\"\"\n    Ridge regression solver.\n    \"\"\"\n\n    _regr_str = \"ridge\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        A = X.T @ X\n        dy = y - (np.sum(X, axis=1, keepdims=True) * y_mean).reshape(y.shape)\n        Xy = X.T @ dy\n        mean = np.linalg.solve(A, Xy)\n        sigma2 = np.var(X @ mean - dy)\n        Ainv = np.linalg.inv(A)\n        cov = np.sqrt(sigma2 * np.einsum(\"ij,kj,kl,li->i\", Ainv, X, X, Ainv))\n        mean = mean + y_mean.reshape([-1])\n        return mean, cov\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver","title":"Solver","text":"

    Bases: ABC

    Abstract class for regression solvers.

    Source code in openqdc/utils/regressor.py
    class Solver(ABC):\n    \"\"\"Abstract class for regression solvers.\"\"\"\n\n    _regr_str: str\n\n    @staticmethod\n    @abstractmethod\n    def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Main method to solve the regression problem.\n        Must be implemented in all the subclasses.\n\n        Parameters:\n            X: Input features of shape (n_samples, n_species)\n            Y: Target values of shape (n_samples,) (energy values for the regression)\n\n        Returns:\n            Tuple of predicted values and the estimated uncertainty.\n        \"\"\"\n        pass\n\n    def __call__(self, X, Y):\n        return self.solve(X, Y)\n\n    def __str__(self):\n        return self._regr_str\n\n    def __repr__(self):\n        return str(self)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver.solve","title":"solve(X, Y) abstractmethod staticmethod","text":"

    Main method to solve the regression problem. Must be implemented in all the subclasses.

    Parameters:

    Name Type Description Default X ndarray

    Input features of shape (n_samples, n_species)

    required Y ndarray

    Target values of shape (n_samples,) (energy values for the regression)

    required

    Returns:

    Type Description Tuple[ndarray, Optional[ndarray]]

    Tuple of predicted values and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    @staticmethod\n@abstractmethod\ndef solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n    \"\"\"\n    Main method to solve the regression problem.\n    Must be implemented in all the subclasses.\n\n    Parameters:\n        X: Input features of shape (n_samples, n_species)\n        Y: Target values of shape (n_samples,) (energy values for the regression)\n\n    Returns:\n        Tuple of predicted values and the estimated uncertainty.\n    \"\"\"\n    pass\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.atom_standardization","title":"atom_standardization(X, y)","text":"

    Standardize the energies and the atom counts. This will make the calculated uncertainty more meaningful.

    Source code in openqdc/utils/regressor.py
    def atom_standardization(X, y):\n    \"\"\"\n    Standardize the energies and the atom counts.\n    This will make the calculated uncertainty more\n    meaningful.\n    \"\"\"\n    X_norm = X.sum()\n    X = X / X_norm\n    y = y / X_norm\n    y_mean = y.sum() / X.sum()\n    return X, y, y_mean\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.non_nan_idxs","title":"non_nan_idxs(array)","text":"

    Return the indices of the non-NaN entries of an array.

    Source code in openqdc/utils/regressor.py
    def non_nan_idxs(array):\n    \"\"\"\n    Return non nan indices of an array.\n    \"\"\"\n    return np.where(~np.isnan(array))[0]\n
    "},{"location":"API/units.html","title":"UNITS","text":"

    Units conversion utilities module.

    Available Energy units

    [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\", \"mev\", \"ryd\"]

    Available Distance units

    [\"ang\", \"nm\", \"bohr\"]

    Available Force units

    Combinations between Energy and Distance units

    "},{"location":"API/units.html#openqdc.utils.units.Conversion","title":"Conversion","text":"

    Conversion from one unit system to another defined by a name and a callable

    Source code in openqdc/utils/units.py
    class Conversion:\n    \"\"\"\n    Conversion from one unit system to another defined by a name and a callable\n    \"\"\"\n\n    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n        \"\"\"\n\n        Parameters:\n            in_unit: String defining the units of the current values\n            out_unit: String defining the target units\n            func: The callable to compute the conversion\n        \"\"\"\n        name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n        if name in CONVERSION_REGISTRY:\n            raise ConversionAlreadyDefined(in_unit, out_unit)\n        CONVERSION_REGISTRY[name] = self\n\n        self.name = name\n        self.fn = func\n\n    def __call__(self, x):\n        return self.fn(x)\n
    "},{"location":"API/units.html#openqdc.utils.units.Conversion.__init__","title":"__init__(in_unit, out_unit, func)","text":"

    Parameters:

    Name Type Description Default in_unit str

    String defining the units of the current values

    required out_unit str

    String defining the target units

    required func Callable[[float], float]

    The callable to compute the conversion

    required Source code in openqdc/utils/units.py
    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n    \"\"\"\n\n    Parameters:\n        in_unit: String defining the units of the current values\n        out_unit: String defining the target units\n        func: The callable to compute the conversion\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n    if name in CONVERSION_REGISTRY:\n        raise ConversionAlreadyDefined(in_unit, out_unit)\n    CONVERSION_REGISTRY[name] = self\n\n    self.name = name\n    self.fn = func\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion","title":"DistanceTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible distance units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass DistanceTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible distance units for conversion\n    \"\"\"\n\n    ANG = \"ang\"\n    NM = \"nm\"\n    BOHR = \"bohr\"\n\n    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the distance to the desired units.\n\n        Parameters:\n            distance: distance unit to convert to\n            fraction: whether it is distance^1 or distance^-1\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion.to","title":"to(distance, fraction=False)","text":"

    Get the conversion function to convert the distance to the desired units.

    Parameters:

    Name Type Description Default distance DistanceTypeConversion

    distance unit to convert to

    required fraction bool

    whether it is distance^1 or distance^-1

    False

    Returns:

    Type Description Callable[[float], float]

    callable to convert the distance to the desired units

    Source code in openqdc/utils/units.py
    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the distance to the desired units.\n\n    Parameters:\n        distance: distance unit to convert to\n        fraction: whether it is distance^1 or distance^-1\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion","title":"EnergyTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible energy units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass EnergyTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible energy units for conversion\n    \"\"\"\n\n    KCAL_MOL = \"kcal/mol\"\n    KJ_MOL = \"kj/mol\"\n    HARTREE = \"hartree\"\n    EV = \"ev\"\n    MEV = \"mev\"\n    RYD = \"ryd\"\n\n    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the energy to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n\n        Returns:\n            Callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion.to","title":"to(energy)","text":"

    Get the conversion function to convert the energy to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    Callable to convert the energy to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the energy to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n\n    Returns:\n        Callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion","title":"ForceTypeConversion","text":"

    Bases: ConversionEnum

    Define the possible force units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass ForceTypeConversion(ConversionEnum):\n    \"\"\"\n    Define the possible foce units for conversion\n    \"\"\"\n\n    #     Name      = EnergyTypeConversion,         , DistanceTypeConversion\n    HARTREE_BOHR = EnergyTypeConversion.HARTREE, DistanceTypeConversion.BOHR\n    HARTREE_ANG = EnergyTypeConversion.HARTREE, DistanceTypeConversion.ANG\n    HARTREE_NM = EnergyTypeConversion.HARTREE, DistanceTypeConversion.NM\n    EV_BOHR = EnergyTypeConversion.EV, DistanceTypeConversion.BOHR\n    EV_ANG = EnergyTypeConversion.EV, DistanceTypeConversion.ANG\n    EV_NM = EnergyTypeConversion.EV, DistanceTypeConversion.NM\n    KCAL_MOL_BOHR = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.BOHR\n    KCAL_MOL_ANG = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.ANG\n    KCAL_MOL_NM = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.NM\n    KJ_MOL_BOHR = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.BOHR\n    KJ_MOL_ANG = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.ANG\n    KJ_MOL_NM = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.NM\n    MEV_BOHR = EnergyTypeConversion.MEV, DistanceTypeConversion.BOHR\n    MEV_ANG = EnergyTypeConversion.MEV, DistanceTypeConversion.ANG\n    MEV_NM = EnergyTypeConversion.MEV, DistanceTypeConversion.NM\n    RYD_BOHR = EnergyTypeConversion.RYD, DistanceTypeConversion.BOHR\n    RYD_ANG = EnergyTypeConversion.RYD, DistanceTypeConversion.ANG\n    RYD_NM = EnergyTypeConversion.RYD, DistanceTypeConversion.NM\n\n    def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):\n        self.energy = energy\n        self.distance = distance\n\n    def __str__(self):\n        return f\"{self.energy}/{self.distance}\"\n\n    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the force to the desired units.\n\n        Parameters:\n            energy: energy unit 
to convert to\n            distance: distance unit to convert to\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion.to","title":"to(energy, distance)","text":"

    Get the conversion function to convert the force to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required distance DistanceTypeConversion

    distance unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    callable to convert the force to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the force to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n        distance: distance unit to convert to\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html","title":"Utils","text":""},{"location":"API/utils.html#openqdc.utils.check_file","title":"check_file(path)","text":"

    Checks whether the file exists at the given local path

    Source code in openqdc/utils/io.py
    def check_file(path) -> bool:\n    \"\"\"Checks if file present on local\"\"\"\n    return os.path.exists(path)\n
    "},{"location":"API/utils.html#openqdc.utils.create_hdf5_file","title":"create_hdf5_file(hdf5_file_path)","text":"

    Creates hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def create_hdf5_file(hdf5_file_path: str):\n    \"\"\"Creates hdf5 file with fsspec\"\"\"\n    fp = fsspec.open(hdf5_file_path, \"wb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    return h5py.File(fp, \"a\")\n
    "},{"location":"API/utils.html#openqdc.utils.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html#openqdc.utils.get_local_cache","title":"get_local_cache()","text":"

    Returns the local cache directory. It creates it if it does not exist.

    Returns:

    Name Type Description str str

    path to the local cache directory

    Source code in openqdc/utils/io.py
    def get_local_cache() -> str:\n    \"\"\"\n    Returns the local cache directory. It creates it if it does not exist.\n\n    Returns:\n        str: path to the local cache directory\n    \"\"\"\n    cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR))\n    os.makedirs(cache_dir, exist_ok=True)\n    return cache_dir\n
    "},{"location":"API/utils.html#openqdc.utils.get_remote_cache","title":"get_remote_cache(write_access=False)","text":"

    Returns the entry point based on the write access.

    Source code in openqdc/utils/io.py
    def get_remote_cache(write_access=False) -> str:\n    \"\"\"\n    Returns the entry point based on the write access.\n    \"\"\"\n    if write_access:\n        remote_cache = \"openqdc/v1\"  # \"gs://qmdata-public/openqdc\"\n        # remote_cache = \"gs://qmdata-public/openqdc\"\n    else:\n        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get(\"OPENQDC_DOWNLOAD_API\", \"s3\"))\n        # remote_cache = \"https://storage.googleapis.com/qmdata-public/openqdc\"\n    return remote_cache\n
    "},{"location":"API/utils.html#openqdc.utils.load_hdf5_file","title":"load_hdf5_file(hdf5_file_path)","text":"

    Loads hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def load_hdf5_file(hdf5_file_path: str):\n    \"\"\"Loads hdf5 file with fsspec\"\"\"\n    if not check_file(hdf5_file_path):\n        raise FileNotFoundError(f\"File {hdf5_file_path} does not exist on GCS and local.\")\n\n    fp = fsspec.open(hdf5_file_path, \"rb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    file = h5py.File(fp)\n\n    # inorder to enable multiprocessing:\n    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801\n    # fsspec.asyn.iothread[0] = None\n    # fsspec.asyn.loop[0] = None\n\n    return file\n
    "},{"location":"API/utils.html#openqdc.utils.load_json","title":"load_json(path)","text":"

    Loads json file

    Source code in openqdc/utils/io.py
    def load_json(path):\n    \"\"\"Loads json file\"\"\"\n    with fsspec.open(path, \"r\") as fp:  # Unpickling\n        return json.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.load_pkl","title":"load_pkl(path, check=True)","text":"

    Load pkl file

    Source code in openqdc/utils/io.py
    def load_pkl(path, check=True):\n    \"\"\"Load pkl file\"\"\"\n    if check:\n        if not check_file(path):\n            raise FileNotFoundError(f\"File {path} does not exist on GCS and local.\")\n\n    with open(path, \"rb\") as fp:  # Unpickling\n        return pkl.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.makedirs","title":"makedirs(path, exist_ok=True)","text":"

    Creates directory

    Source code in openqdc/utils/io.py
    def makedirs(path, exist_ok=True):\n    \"\"\"Creates directory\"\"\"\n    os.makedirs(path, exist_ok=exist_ok)\n
    "},{"location":"API/utils.html#openqdc.utils.read_qc_archive_h5","title":"read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)","text":"

    Extracts data from the HDF5 archive file.

    Source code in openqdc/utils/io.py
    def read_qc_archive_h5(\n    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None\n) -> List[Dict[str, np.ndarray]]:\n    \"\"\"Extracts data from the HDF5 archive file.\"\"\"\n    data = load_hdf5_file(raw_path)\n    data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}\n\n    n = len(data_t[\"molecule_id\"])\n    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]\n    return samples\n
    "},{"location":"API/utils.html#openqdc.utils.save_pkl","title":"save_pkl(file, path)","text":"

    Saves pkl file

    Source code in openqdc/utils/io.py
    def save_pkl(file, path):\n    \"\"\"Saves pkl file\"\"\"\n    logger.info(f\"Saving file at {path}\")\n    with fsspec.open(path, \"wb\") as fp:  # Pickling\n        pkl.dump(file, fp)\n
    "},{"location":"API/utils.html#openqdc.utils.set_cache_dir","title":"set_cache_dir(d)","text":"

    Optionally set the _OPENQDC_CACHE_DIR directory.

    Parameters:

    Name Type Description Default d str

    path to a local folder.

    required Source code in openqdc/utils/io.py
    def set_cache_dir(d):\n    r\"\"\"\n    Optionally set the _OPENQDC_CACHE_DIR directory.\n\n    Args:\n        d (str): path to a local folder.\n    \"\"\"\n    if d is None:\n        return\n    global _OPENQDC_CACHE_DIR\n    _OPENQDC_CACHE_DIR = os.path.normpath(os.path.expanduser(d))\n
    "},{"location":"API/datasets/alchemy.html","title":"Alchemy","text":""},{"location":"API/datasets/alchemy.html#openqdc.datasets.potential.alchemy.Alchemy","title":"Alchemy","text":"

    Bases: BaseDataset

    Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange matrix.

    Usage:

    from openqdc.datasets import Alchemy\ndataset = Alchemy()\n

    Reference

    https://arxiv.org/abs/1906.09427 https://alchemy.tencent.com/

    Source code in openqdc/datasets/potential/alchemy.py
    class Alchemy(BaseDataset):\n    \"\"\"\n    Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.\n    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level\n    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used\n    to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G\n    is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the\n    B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. The\n    auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange\n    matrix.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Alchemy\n    dataset = Alchemy()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/1906.09427\n        https://alchemy.tencent.com/\n    \"\"\"\n\n    __name__ = \"alchemy\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\"alchemy.zip\": \"https://alchemy.tencent.com/data/alchemy-v20191129.zip\"}\n\n    def read_raw_entries(self):\n        dir_path = p_join(self.root, \"Alchemy-v20191129\")\n        full_csv = pd.read_csv(p_join(dir_path, \"final_version.csv\"))\n        energies = full_csv[\"U0\\n(Ha, internal energy at 0 K)\"].tolist()\n        atom_folder = full_csv[\"atom number\"]\n        gdb_idx = full_csv[\"gdb_idx\"]\n        idxs = full_csv.index.tolist()\n        samples = []\n        for i in tqdm(idxs):\n            sdf_file = p_join(dir_path, f\"atom_{atom_folder[i]}\", 
f\"{gdb_idx[i]}.sdf\")\n            energy = energies[i]\n            samples.append(read_mol(sdf_file, energy))\n        return samples\n
    "},{"location":"API/datasets/ani.html","title":"ANI","text":""},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1","title":"ANI1","text":"

    Bases: BaseDataset

    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT level.

    Usage:

    from openqdc.datasets import ANI1\ndataset = ANI1()\n

    References

    https://www.nature.com/articles/sdata2017193

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1(BaseDataset):\n    \"\"\"\n    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic\n    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the\n    wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules\n    are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary\n    point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT\n    level.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1\n    dataset = ANI1()\n    ```\n\n    References:\n        https://www.nature.com/articles/sdata2017193\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"ani1.hdf5.gz\": \"https://zenodo.org/record/3585840/files/214.hdf5.gz\"}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"ani\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"ani\", links=self.__links__)\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, 
self.force_target_names)\n        return samples\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX","title":"ANI1CCX","text":"

    Bases: ANI1

    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.

    Usage:

    from openqdc.datasets import ANI1CCX\ndataset = ANI1CCX()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX(ANI1):\n    \"\"\"\n    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active\n    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX\n    dataset = ANI1CCX()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_T_CBS,  # \"ccsd(t)/cbs\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVTZ,  # \"ccsd(t)/cc-pvtz\",\n        PotentialMethod.TCSSD_T_CC_PVDZ,  # \"tccsd(t)/cc-pvdz\",\n    ]\n\n    energy_target_names = [\n        \"CCSD(T)*:CBS Total Energy\",\n        \"NPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n        \"NPNO-CCSD(T):cc-pVTZ Correlation Energy\",\n        \"TPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n    ]\n    force_target_names = []\n    __links__ = {\"ani1x.hdf5.gz\": \"https://zenodo.org/record/4081694/files/292.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/ani.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX_V2","title":"ANI1CCX_V2","text":"

    Bases: ANI1CCX

    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each conformation.

    Usage:

    from openqdc.datasets import ANI1CCX_V2\ndataset = ANI1CCX_V2()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX_V2(ANI1CCX):\n    \"\"\"\n    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels\n    for each conformation.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX_V2\n    dataset = ANI1CCX_V2()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx_v2\"\n\n    __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]\n    energy_target_names = ANI1CCX.energy_target_names + [\"PM6\", \"GFN2\"]\n    __force_mask__ = ANI1CCX.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1X","title":"ANI1X","text":"

    Bases: ANI1

    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the following techniques is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.

    Usage:

    from openqdc.datasets import ANI1X\ndataset = ANI1X()\n

    References

    https://doi.org/10.1063/1.5023802

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1X(ANI1):\n    \"\"\"\n    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to\n    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,\n    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the techniques\n    are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and\n    (4) torsion sampling.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1X\n    dataset = ANI1X()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5023802\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.HF_CC_PVDZ,\n        PotentialMethod.HF_CC_PVQZ,\n        PotentialMethod.HF_CC_PVTZ,\n        PotentialMethod.MP2_CC_PVDZ,\n        PotentialMethod.MP2_CC_PVQZ,\n        PotentialMethod.MP2_CC_PVTZ,\n        PotentialMethod.WB97X_6_31G_D,\n        PotentialMethod.WB97X_CC_PVTZ,\n    ]\n\n    energy_target_names = [\n        \"HF:cc-pVDZ Total Energy\",\n        \"HF:cc-pVQZ Total Energy\",\n        \"HF:cc-pVTZ Total Energy\",\n        \"MP2:cc-pVDZ Correlation Energy\",\n        \"MP2:cc-pVQZ Correlation Energy\",\n        \"MP2:cc-pVTZ Correlation Energy\",\n        \"wB97x:6-31G(d) Total Energy\",\n        \"wB97x:def2-TZVPP Total Energy\",\n    ]\n\n    force_target_names = [\n        \"wB97x:6-31G(d) Atomic Forces\",\n        \"wB97x:def2-TZVPP Atomic Forces\",\n    ]\n\n    __force_mask__ = [False, False, False, False, False, False, True, True]\n    __links__ = {\"ani1ccx.hdf5.gz\": \"https://zenodo.org/record/4081692/files/293.hdf5.gz\"}\n\n    def convert_forces(self, x):\n        return super().convert_forces(x) * 0.529177249  # correct the 
Dataset error\n\n    def __smiles_converter__(self, x):\n        return x\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI2X","title":"ANI2X","text":"

    Bases: ANI1

    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, ChEMBL, and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as in ANI-1X are used for generating geometries.

    Usage:

    from openqdc.datasets import ANI2X\ndataset = ANI2X()\n

    References

    https://doi.org/10.1021/acs.jctc.0c00121 https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI2X(ANI1):\n    \"\"\"\n    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.\n    It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized\n    using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are\n    used for generating geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI2X\n    dataset = ANI2X()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.0c00121\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani2x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        # PotentialMethod.NONE,  # \"b973c/def2mtzvp\",\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/631gd\", # PAPER DATASET\n        # PotentialMethod.NONE,  # \"wb97md3bj/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97mv/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97x/def2tzvpp\",\n    ]\n\n    energy_target_names = [\n        # \"b973c/def2mtzvp\",\n        \"wb97x/631gd\",\n        # \"wb97md3bj/def2tzvpp\",\n        # \"wb97mv/def2tzvpp\",\n        # \"wb97x/def2tzvpp\",\n    ]\n\n    force_target_names = [\"wb97x/631gd\"]  # \"b973c/def2mtzvp\",\n\n    __force_mask__ = [True]\n    __links__ = {  # \"ANI-2x-B973c-def2mTZVP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1\", # noqa\n        # \"ANI-2x-wB97MV-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1\", # noqa\n        \"ANI-2x-wB97X-631Gd.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1\",  
# noqa\n        # \"ANI-2x-wB97X-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1\", # noqa\n    }\n\n    def __smiles_converter__(self, x):\n        return x\n\n    def read_raw_entries(self):\n        samples = []\n        for lvl_theory in self.__links__.keys():\n            raw_path = p_join(self.root, \"final_h5\", f\"{lvl_theory.split('.')[0]}.h5\")\n            samples.extend(read_ani2_h5(raw_path))\n        return samples\n
    "},{"location":"API/datasets/comp6.html","title":"Comp6","text":""},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6","title":"COMP6","text":"

    Bases: BaseDataset

    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and molecular dipoles.

    Details of the benchmark sets are as follows

    S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and

    mixed influence interactions.

    ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n

    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point calculations are performed to calculate energies and forces.

    GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n

    The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence criteria. Normal modes/force constants are computed using the reference DFT model. Finally, Diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.

    GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n

    and 1000 molecules per 12 and 13 heavy atoms are subsampled from GDB-13. Non-equilibrium conformations are generated via DNMS.

    Tripeptide: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n\nDrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n

    Structures are optimized similar to GDB7to9.

    Usage:

    from openqdc.datasets import COMP6\ndataset = COMP6()\n

    References

    https://aip.scitation.org/doi/abs/10.1063/1.5023802

    https://github.com/isayev/COMP6

    S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d

    GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/

    GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/

    DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h

    Source code in openqdc/datasets/potential/comp6.py
    class COMP6(BaseDataset):\n    \"\"\"\n    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the\n    ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and\n    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using\n    the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfield charges and\n    molecular dipoles.\n\n    Details of the benchmark sets are as follows:\n        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and\n    mixed influence interactions.\\n\n        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small\n    proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point\n    calculations are performed to calculate energies and forces.\\n\n        GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n    The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence\n    criteria. Normal modes/force constants are computer using the reference DFT model. Finally, Diverse normal\n    mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\\n\n        GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are\n    generated via DNMS.\\n\n        Tripeptide: Consists of 248 random tripeptides. 
Structures are optimized similar to GDB7to9.\\n\n        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n    Structures are optimized similar to GDB7to9.\n\n    Usage:\n    ```python\n    from openqdc.datasets import COMP6\n    dataset = COMP6()\n    ```\n\n    References:\n        https://aip.scitation.org/doi/abs/10.1063/1.5023802\\n\n        https://github.com/isayev/COMP6\\n\n        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\\n\n        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\\n\n        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\\n\n        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h\n    \"\"\"\n\n    __name__ = \"comp6\"\n\n    # watchout that forces are stored as -grad(E)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"  # angstorm\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g*\",\n        PotentialMethod.B3LYP_D3_BJ_DEF2_TZVP,  # \"b3lyp-d3(bj)/def2-tzvp\",\n        PotentialMethod.B3LYP_DEF2_TZVP,  # \"b3lyp/def2-tzvp\",\n        PotentialMethod.HF_DEF2_TZVP,  # \"hf/def2-tzvp\",\n        PotentialMethod.PBE_D3_BJ_DEF2_TZVP,  # \"pbe-d3(bj)/def2-tzvp\",\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n        PotentialMethod.SVWN_DEF2_TZVP,  # \"svwn/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"Energy\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP:def2-tzvp\",\n        \"HF:def2-tzvp\",\n        \"PBE-D3M(BJ):def2-tzvp\",\n        \"PBE:def2-tzvp\",\n        \"SVWN:def2-tzvp\",\n    ]\n    __force_mask__ = [True, False, False, False, False, False, False]\n\n    force_target_names = [\n        \"Gradient\",\n    ]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n      
  \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        samples = []\n        for subset in [\"ani_md\", \"drugbank\", \"gdb7_9\", \"gdb10_13\", \"s66x8\", \"tripeptides\"]:\n            raw_path = p_join(self.root, f\"{subset}.h5.gz\")\n            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/comp6.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/des.html","title":"DES","text":""},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES370K","title":"DES370K","text":"

    Bases: BaseInteractionDataset, IDES

    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions) including water and functional groups found in proteins. Dimer geometries are generated using QM-based optimization at the DF-LMP2/aVDZ level of theory and MD-based sampling from condensed phase MD simulations.

    Usage:

    from openqdc.datasets import DES370K\ndataset = DES370K()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES370K(BaseInteractionDataset, IDES):\n    \"\"\"\n    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies\n    computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules\n    and ions) including water and functional groups found in proteins. Dimer geometries are generated using\n    QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES370K\n    dataset = DES370K()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des370k_interaction\"\n    __filename__ = \"DES370K.csv\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVDZ,\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_CC_PVDZ,\n        InteractionMethod.CCSD_T_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        
InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"cc_MP2_all\",\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"cc_CCSD(T)_all\",\n        \"cbs_CCSD(T)_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES370K.zip\": \"https://zenodo.org/record/5676266/files/DES370K.zip\",\n    }\n\n    @property\n    def csv_path(self):\n        return os.path.join(self.root, self.__filename__)\n\n    def _create_subsets(self, **kwargs):\n        return create_subset(kwargs[\"smiles0\"], kwargs[\"smiles1\"])\n\n    def read_raw_entries(self) -> List[Dict]:\n        filepath = self.csv_path\n        logger.info(f\"Reading {self.__name__} interaction data from {filepath}\")\n        df = pd.read_csv(filepath)\n        data = []\n        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):\n            item = parse_des_df(row, self.energy_target_names)\n            item[\"subset\"] = self._create_subsets(row=row, **item)\n            item = convert_to_record(item)\n            data.append(item)\n        return data\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES5M","title":"DES5M","text":"

    Bases: DES370K

    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similarly to DES370K using QM-based optimization and MD simulations.

    Usage:

    from openqdc.datasets import DES5M\ndataset = DES5M()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES5M(DES370K):\n    \"\"\"\n    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies\n    computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using\n    QM based optimization and MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES5M\n    dataset = DES5M()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des5m_interaction\"\n    __filename__ = \"DES5M.csv\"\n\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n 
       \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES5M.zip\": \"https://zenodo.org/records/5706002/files/DESS5M.zip?download=1\",\n    }\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66","title":"DESS66","text":"

    Bases: DES370K

    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66\ndataset = DESS66()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    S66: https://pubs.acs.org/doi/10.1021/ct2002946

    Source code in openqdc/datasets/interaction/des.py
    class DESS66(DES370K):\n    \"\"\"\n    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total.\n    The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66\n    dataset = DESS66()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\\n\n        S66: https://pubs.acs.org/doi/10.1021/ct2002946\n    \"\"\"\n\n    __name__ = \"des_s66\"\n    __filename__ = \"DESS66.csv\"\n    __links__ = {\"DESS66.zip\": \"https://zenodo.org/records/5676284/files/DESS66.zip?download=1\"}\n\n    def _create_subsets(self, **kwargs):\n        return kwargs[\"row\"][\"system_name\"]\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66x8","title":"DESS66x8","text":"

    Bases: DESS66

    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66x8\ndataset = DESS66x8()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DESS66x8(DESS66):\n    \"\"\"\n    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve\n    giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66x8\n    dataset = DESS66x8()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des_s66x8\"\n    __filename__ = \"DESS66x8.csv\"\n    __links__ = {\"DESS66x8.zip\": \"https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1\"}\n
    "},{"location":"API/datasets/gdml.html","title":"GDML","text":""},{"location":"API/datasets/gdml.html#openqdc.datasets.potential.gdml.GDML","title":"GDML","text":"

    Bases: BaseDataset

    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of Benzene (627000 conformations), Uracil (133000 conformations), Naphthalene (326000 conformations), Aspirin (211000 conformations), Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for each conformation are computed using the PBE + vdW-TS electronic structure method.

    The dataset consists of the following trajectories

    Benzene: 627000 samples

    Uracil: 133000 samples

    Naphthalene: 326000 samples

    Aspirin: 211000 samples

    Salicylic Acid: 320000 samples

    Malonaldehyde: 993000 samples

    Ethanol: 555000 samples

    Toluene: 100000 samples

    Usage:

    from openqdc.datasets import GDML\ndataset = GDML()\n

    References

    https://www.science.org/doi/10.1126/sciadv.1603015 http://www.sgdml.org/#datasets

    Source code in openqdc/datasets/potential/gdml.py
    class GDML(BaseDataset):\n    \"\"\"\n    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio\n    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene\n    (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin\n    (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations),\n    Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for\n    each conformation are computed using the PBE + vdW-TS electronic structure method.\n    molecular dynamics (AIMD) trajectories.\n\n    The dataset consists of the following trajectories:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import GDML\n    dataset = GDML()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.1603015\n        http://www.sgdml.org/#datasets\n    \"\"\"\n\n    __name__ = \"gdml\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_CC_PVDZ,  # \"ccsd/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        # TODO: verify if basis set vdw-ts == def2-tzvp and\n        # it is the same in ISO17 and revmd17\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",  # MD17\n    ]\n\n    energy_target_names = [\n        \"CCSD Energy\",\n        \"CCSD(T) Energy\",\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True, True, True]\n\n    force_target_names = [\n        \"CCSD Gradient\",\n        \"CCSD(T) Gradient\",\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n 
   __forces_unit__ = \"kcal/mol/ang\"\n    __links__ = {\n        \"gdb7_9.hdf5.gz\": \"https://zenodo.org/record/3588361/files/208.hdf5.gz\",\n        \"gdb10_13.hdf5.gz\": \"https://zenodo.org/record/3588364/files/209.hdf5.gz\",\n        \"drugbank.hdf5.gz\": \"https://zenodo.org/record/3588361/files/207.hdf5.gz\",\n        \"tripeptides.hdf5.gz\": \"https://zenodo.org/record/3588368/files/211.hdf5.gz\",\n        \"ani_md.hdf5.gz\": \"https://zenodo.org/record/3588341/files/205.hdf5.gz\",\n        \"s66x8.hdf5.gz\": \"https://zenodo.org/record/3588367/files/210.hdf5.gz\",\n    }\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"gdml.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"gdml\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/geom.html","title":"GEOM","text":"

    Bases: BaseDataset

    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.

    Usage:

    from openqdc.datasets import GEOM\ndataset = GEOM()\n

    References

    https://www.nature.com/articles/s41597-022-01288-4

    https://github.com/learningmatter-mit/geom

    CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d

    Source code in openqdc/datasets/potential/geom.py
    class GEOM(BaseDataset):\n    \"\"\"\n    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules\n    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.\n    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and\n    the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the\n    conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.\n\n    Usage:\n    ```python\n    from openqdc.datasets import GEOM\n    dataset = GEOM()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-022-01288-4\\n\n        https://github.com/learningmatter-mit/geom\\n\n        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d\n    \"\"\"\n\n    __name__ = \"geom\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    energy_target_names = [\"gfn2_xtb.energy\"]\n    force_target_names = []\n\n    partitions = [\"qm9\", \"drugs\"]\n    __links__ = {\"rdkit_folder.tar.gz\": \"https://dataverse.harvard.edu/api/access/datafile/4327252\"}\n\n    def _read_raw_(self, partition):\n        raw_path = p_join(self.root, \"rdkit_folder\")\n\n        mols = load_json(p_join(raw_path, f\"summary_{partition}.json\"))\n        mols = list(mols.items())\n\n        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731\n        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job\n        return samples\n\n    def read_raw_entries(self):\n        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])\n        return samples\n
    "},{"location":"API/datasets/iso_17.html","title":"ISO_17","text":""},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17","title":"ISO17","text":"

    Bases: BaseDataset

    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.

    Usage:

    from openqdc.datasets import ISO17\ndataset = ISO17()\n

    References

    https://arxiv.org/abs/1706.08566

    https://arxiv.org/abs/1609.08259

    https://www.nature.com/articles/sdata201422

    https://pubmed.ncbi.nlm.nih.gov/10062328/

    https://pubmed.ncbi.nlm.nih.gov/19257665/

    Source code in openqdc/datasets/potential/iso_17.py
    class ISO17(BaseDataset):\n    \"\"\"\n    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of\n    atoms (C7O2H10) arranged in different chemically valid structures. It consist of 129 molecules, each containing\n    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics\n    trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient\n    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der\n    Waals correction method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ISO17\n    dataset = ISO17()\n    ```\n\n    References:\n        https://arxiv.org/abs/1706.08566\\n\n        https://arxiv.org/abs/1609.08259\\n\n        https://www.nature.com/articles/sdata201422\\n\n        https://pubmed.ncbi.nlm.nih.gov/10062328/\\n\n        https://pubmed.ncbi.nlm.nih.gov/19257665/\n    \"\"\"\n\n    __name__ = \"iso_17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"iso_17.hdf5.gz\": \"https://zenodo.org/record/3585907/files/216.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"iso_17.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"iso_17\", self.energy_target_names, self.force_target_names)\n\n     
   return samples\n
    "},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/iso_17.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/l7.html","title":"L7","text":""},{"location":"API/datasets/l7.html#openqdc.datasets.interaction.l7.L7","title":"L7","text":"

    Bases: YamlDataset

    The L7 interaction energy dataset consists of 7 dispersion-stabilized non-covalent complexes with energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are taken from crystal X-ray data and optimized with a DFT method specific to the complex.

    Usage:

    from openqdc.datasets import L7\ndataset = L7()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct400036b

    Source code in openqdc/datasets/interaction/l7.py
    class L7(YamlDataset):\n    \"\"\"\n    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with\n    energies labelled using semi-empirical and quantum mechanical methods. The intial geometries are\n    taken from crystal X-ray data and optimized with a DFT method specific to the complex.\n\n    Usage:\n    ```python\n    from openqdc.datasets import L7\n    dataset = L7()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct400036b\n    \"\"\"\n\n    __name__ = \"l7\"\n    __energy_methods__ = [\n        InteractionMethod.QCISDT_CBS,  # \"QCISD(T)/CBS\",\n        InteractionMethod.DLPNO_CCSDT,  # \"DLPNO-CCSD(T)\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.MP2C_CBS,  # \"MP2C/CBS\",\n        InteractionMethod.FIXED,  # \"fixed\", TODO: we should remove this level of theory because unless we have a pro\n        InteractionMethod.DLPNO_CCSDT0,  # \"DLPNO-CCSD(T0)\",\n        InteractionMethod.LNO_CCSDT,  # \"LNO-CCSD(T)\",\n        InteractionMethod.FN_DMC,  # \"FN-DMC\",\n    ]\n    __links__ = {\n        \"l7.yaml\": \"http://cuby4.molecular.cz/download_datasets/l7.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/L7.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.geometry.split(\":\")[1]\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        return np.array([int(item.setup[\"molecule_a\"][\"selection\"].split(\"-\")[1])], dtype=np.int32)\n
    "},{"location":"API/datasets/md22.html","title":"MD22","text":""},{"location":"API/datasets/md22.html#openqdc.datasets.potential.md22.MD22","title":"MD22","text":"

    Bases: RevMD17

    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD level of theory.

    Usage:

    from openqdc.datasets import MD22\ndataset = MD22()\n

    Reference

    https://arxiv.org/abs/2209.14865

    Source code in openqdc/datasets/potential/md22.py
    class MD22(RevMD17):\n    \"\"\"\n    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,\n    ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories\n    are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD\n    level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MD22\n    dataset = MD22()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2209.14865\n    \"\"\"\n\n    __name__ = \"md22\"\n    __links__ = {\n        f\"{x}.npz\": f\"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz\"\n        for x in [\n            \"Ac-Ala3-NHMe\",\n            \"DHA\",\n            \"stachyose\",\n            \"AT-AT\",\n            \"AT-AT-CG-CG\",\n            \"double-walled_nanotube\",\n            \"buckyball-catcher\",\n        ]\n    }\n\n    def read_raw_entries(self):\n        entries_list = []\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/metcalf.html","title":"Metcalf","text":""},{"location":"API/datasets/metcalf.html#openqdc.datasets.interaction.metcalf.Metcalf","title":"Metcalf","text":"

    Bases: BaseInteractionDataset

    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various components.

    Usage:

    from openqdc.datasets import Metcalf\ndataset = Metcalf()\n

    Reference

    https://doi.org/10.1063/1.5142636

    Source code in openqdc/datasets/interaction/metcalf.py
    class Metcalf(BaseInteractionDataset):\n    \"\"\"\n    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to\n    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and\n    the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various\n    components.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Metcalf\n    dataset = Metcalf()\n    ```\n\n    Reference:\n        https://doi.org/10.1063/1.5142636\n    \"\"\"\n\n    __name__ = \"metcalf\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n    ]\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = [\n        \"total energy\",\n        \"electrostatic energy\",\n        \"exchange energy\",\n        \"induction energy\",\n        \"dispersion energy\",\n    ]\n    __links__ = {\"model-data.tar.gz\": \"https://zenodo.org/records/10934211/files/model-data.tar?download=1\"}\n\n    def read_raw_entries(self) -> List[Dict]:\n        # extract in folders\n        extract_raw_tar_gz(self.root)\n        data = []\n        for filename in glob(self.root + f\"{os.sep}*.xyz\"):\n            data.extend(read_xyz(filename, self.__name__))\n        return data\n
    "},{"location":"API/datasets/molecule3d.html","title":"Molecule3D","text":""},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.Molecule3D","title":"Molecule3D","text":"

    Bases: BaseDataset

    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, or with damaged log files.

    Usage:

    from openqdc.datasets import Molecule3D\ndataset = Molecule3D()\n

    References

    https://arxiv.org/abs/2110.01717

    https://github.com/divelab/MoleculeX

    Source code in openqdc/datasets/potential/molecule3d.py
    class Molecule3D(BaseDataset):\n    \"\"\"\n    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the\n    B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing\n    molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems,\n    or with damaged log files.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Molecule3D\n    dataset = Molecule3D()\n    ```\n\n    References:\n        https://arxiv.org/abs/2110.01717\\n\n        https://github.com/divelab/MoleculeX\n    \"\"\"\n\n    __name__ = \"molecule3d\"\n    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # \"b3lyp/6-31g*\",\n    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY\n    __energy_unit__ = \"ev\"  # CALCULATED\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"molecule3d.zip\": \"https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy\"}\n\n    energy_target_names = [\"b3lyp/6-31g*.energy\"]\n\n    def read_raw_entries(self):\n        raw = p_join(self.root, \"data\", \"raw\")\n        sdf_paths = glob(p_join(raw, \"*.sdf\"))\n        properties_path = p_join(raw, \"properties.csv\")\n\n        fn = lambda x: _read_sdf(x, properties_path)\n        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol","title":"read_mol(mol, energy)","text":"

    Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--parameters","title":"Parameters","text":"

    mol: Chem.rdchem.Mol - RDKit molecule; energy: float - Energy of the molecule

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--returns","title":"Returns","text":"

    res: dict - Dictionary containing the following keys: name: np.ndarray of shape (N,) containing the smiles of the molecule; atomic_inputs: flattened np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions; energies: np.ndarray of shape (1,) containing the energy of the conformer; n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer; subset: np.ndarray of shape (1) containing \"molecule3d\"

    Source code in openqdc/datasets/potential/molecule3d.py
    def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:\n    \"\"\"Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies\n\n    Parameters\n    ----------\n    mol: Chem.rdchem.Mol\n        RDKit molecule\n    energy: float\n        Energy of the molecule\n\n    Returns\n    -------\n    res: dict\n        Dictionary containing the following keys:\n        - name: np.ndarray of shape (N,) containing the smiles of the molecule\n        - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions\n        - energies: np.ndarray of shape (1,) containing the energy of the conformer\n        - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer\n        - subset: np.ndarray of shape (1) containing \"molecule3d\"\n    \"\"\"\n    smiles = dm.to_smiles(mol, explicit_hs=False)\n    # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)\n    x = get_atomic_number_and_charge(mol)\n    positions = mol.GetConformer().GetPositions()\n\n    res = dict(\n        name=np.array([smiles]),\n        subset=np.array([\"molecule3d\"]),\n        energies=np.array([energy]).astype(np.float64)[:, None],\n        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),\n        n_atoms=np.array([x.shape[0]], dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/multixcqm9.html","title":"MultixcQM9","text":""},{"location":"API/datasets/multixcqm9.html#openqdc.datasets.potential.multixcqm9.MultixcQM9","title":"MultixcQM9","text":"

    Bases: BaseDataset

    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting of 133K QM9 molecule geometries calculated with 76 different DFT functionals and three different basis sets resulting in 228 energy values for each molecule along with the semi-empirical method GFN2-xTB. Geometries for the molecules are taken directly from Kim et al., which uses the G4MP2 method.

    Usage:

    from openqdc.datasets import MultixcQM9\ndataset = MultixcQM9()\n

    References

    https://www.nature.com/articles/s41597-023-02690-2

    https://github.com/chemsurajit/largeDFTdata

    https://www.nature.com/articles/s41597-019-0121-7

    Source code in openqdc/datasets/potential/multixcqm9.py
    class MultixcQM9(BaseDataset):\n    \"\"\"\n    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting\n    of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets\n    resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the\n    molecules are used directly from Kim et al. which uses G4MP2 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MultixcQM9\n    dataset = MultixcQM9()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-023-02690-2\\n\n        https://github.com/chemsurajit/largeDFTdata\\n\n        https://www.nature.com/articles/s41597-019-0121-7\\n\n    \"\"\"\n\n    __name__ = \"multixcqm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.KCIS_MODIFIED_DZP,\n        PotentialMethod.KCIS_ORIGINAL_DZP,\n        PotentialMethod.PKZB_DZP,\n        PotentialMethod.VS98_DZP,\n        PotentialMethod.LDA_VWN_DZP,\n        PotentialMethod.PW91_DZP,\n        PotentialMethod.BLYP_DZP,\n        PotentialMethod.BP_DZP,\n        PotentialMethod.PBE_DZP,\n        PotentialMethod.RPBE_DZP,\n        PotentialMethod.REVPBE_DZP,\n        PotentialMethod.OLYP_DZP,\n        PotentialMethod.FT97_DZP,\n        PotentialMethod.BLAP3_DZP,\n        PotentialMethod.HCTH_93_DZP,\n        PotentialMethod.HCTH_120_DZP,\n        PotentialMethod.HCTH_147_DZP,\n        PotentialMethod.HCTH_407_DZP,\n        PotentialMethod.BMTAU1_DZP,\n        PotentialMethod.BOP_DZP,\n        PotentialMethod.PKZBX_KCISCOR_DZP,\n        PotentialMethod.VS98_X_XC_DZP,\n        PotentialMethod.VS98_X_ONLY_DZP,\n        PotentialMethod.BECKE00_DZP,\n        PotentialMethod.BECKE00X_XC_DZP,\n        PotentialMethod.BECKE00_X_ONLY_DZP,\n        PotentialMethod.BECKE88X_BR89C_DZP,\n        PotentialMethod.OLAP3_DZP,\n        PotentialMethod.TPSS_DZP,\n        PotentialMethod.MPBE_DZP,\n      
  PotentialMethod.OPBE_DZP,\n        PotentialMethod.OPERDEW_DZP,\n        PotentialMethod.MPBEKCIS_DZP,\n        PotentialMethod.MPW_DZP,\n        PotentialMethod.TAU_HCTH_DZP,\n        PotentialMethod.XLYP_DZP,\n        PotentialMethod.KT1_DZP,\n        PotentialMethod.KT2_DZP,\n        PotentialMethod.M06_L_DZP,\n        PotentialMethod.BLYP_D_DZP,\n        PotentialMethod.BP86_D_DZP,\n        PotentialMethod.PBE_D_DZP,\n        PotentialMethod.TPSSD_DZP,\n        PotentialMethod.B97_D_DZP,\n        PotentialMethod.REVTPSS_DZP,\n        PotentialMethod.PBESOL_DZP,\n        PotentialMethod.RGE2_DZP,\n        PotentialMethod.SSB_D_DZP,\n        PotentialMethod.MVS_DZP,\n        PotentialMethod.MVSX_DZP,\n        PotentialMethod.TMGGA_DZP,\n        PotentialMethod.TPSSH_DZP,\n        PotentialMethod.B3LYP_VWN5_DZP,\n        PotentialMethod.O3LYP_VWN5_DZP,\n        PotentialMethod.KMLYP_VWN5_DZP,\n        PotentialMethod.PBE0_DZP,\n        PotentialMethod.B3LYP_S_VWN5_DZP,\n        PotentialMethod.BHANDH_DZP,\n        PotentialMethod.BHANDHLYP_DZP,\n        PotentialMethod.B97_DZP,\n        PotentialMethod.B97_1_DZP,\n        PotentialMethod.B97_2_DZP,\n        PotentialMethod.MPBE0KCIS_DZP,\n        PotentialMethod.MPBE1KCIS_DZP,\n        PotentialMethod.B1LYP_VWN5_DZP,\n        PotentialMethod.B1PW91_VWN5_DZP,\n        PotentialMethod.MPW1PW_DZP,\n        PotentialMethod.MPW1K_DZP,\n        PotentialMethod.TAU_HCTH_HYBRID_DZP,\n        PotentialMethod.X3LYP_VWN5_DZP,\n        PotentialMethod.OPBE0_DZP,\n        PotentialMethod.M05_DZP,\n        PotentialMethod.M05_2X_DZP,\n        PotentialMethod.M06_DZP,\n        PotentialMethod.M06_2X_DZP,\n        PotentialMethod.B3LYP_D_DZP,\n        PotentialMethod.KCIS_MODIFIED_TZP,\n        PotentialMethod.KCIS_ORIGINAL_TZP,\n        PotentialMethod.PKZB_TZP,\n        PotentialMethod.VS98_TZP,\n        PotentialMethod.LDA_VWN_TZP,\n        PotentialMethod.PW91_TZP,\n        PotentialMethod.BLYP_TZP,\n        
PotentialMethod.BP_TZP,\n        PotentialMethod.PBE_TZP,\n        PotentialMethod.RPBE_TZP,\n        PotentialMethod.REVPBE_TZP,\n        PotentialMethod.OLYP_TZP,\n        PotentialMethod.FT97_TZP,\n        PotentialMethod.BLAP3_TZP,\n        PotentialMethod.HCTH_93_TZP,\n        PotentialMethod.HCTH_120_TZP,\n        PotentialMethod.HCTH_147_TZP,\n        PotentialMethod.HCTH_407_TZP,\n        PotentialMethod.BMTAU1_TZP,\n        PotentialMethod.BOP_TZP,\n        PotentialMethod.PKZBX_KCISCOR_TZP,\n        PotentialMethod.VS98_X_XC_TZP,\n        PotentialMethod.VS98_X_ONLY_TZP,\n        PotentialMethod.BECKE00_TZP,\n        PotentialMethod.BECKE00X_XC_TZP,\n        PotentialMethod.BECKE00_X_ONLY_TZP,\n        PotentialMethod.BECKE88X_BR89C_TZP,\n        PotentialMethod.OLAP3_TZP,\n        PotentialMethod.TPSS_TZP,\n        PotentialMethod.MPBE_TZP,\n        PotentialMethod.OPBE_TZP,\n        PotentialMethod.OPERDEW_TZP,\n        PotentialMethod.MPBEKCIS_TZP,\n        PotentialMethod.MPW_TZP,\n        PotentialMethod.TAU_HCTH_TZP,\n        PotentialMethod.XLYP_TZP,\n        PotentialMethod.KT1_TZP,\n        PotentialMethod.KT2_TZP,\n        PotentialMethod.M06_L_TZP,\n        PotentialMethod.BLYP_D_TZP,\n        PotentialMethod.BP86_D_TZP,\n        PotentialMethod.PBE_D_TZP,\n        PotentialMethod.TPSSD_TZP,\n        PotentialMethod.B97_D_TZP,\n        PotentialMethod.REVTPSS_TZP,\n        PotentialMethod.PBESOL_TZP,\n        PotentialMethod.RGE2_TZP,\n        PotentialMethod.SSB_D_TZP,\n        PotentialMethod.MVS_TZP,\n        PotentialMethod.MVSX_TZP,\n        PotentialMethod.TMGGA_TZP,\n        PotentialMethod.TPSSH_TZP,\n        PotentialMethod.B3LYP_VWN5_TZP,\n        PotentialMethod.O3LYP_VWN5_TZP,\n        PotentialMethod.KMLYP_VWN5_TZP,\n        PotentialMethod.PBE0_TZP,\n        PotentialMethod.B3LYP_S_VWN5_TZP,\n        PotentialMethod.BHANDH_TZP,\n        PotentialMethod.BHANDHLYP_TZP,\n        PotentialMethod.B97_TZP,\n        
PotentialMethod.B97_1_TZP,\n        PotentialMethod.B97_2_TZP,\n        PotentialMethod.MPBE0KCIS_TZP,\n        PotentialMethod.MPBE1KCIS_TZP,\n        PotentialMethod.B1LYP_VWN5_TZP,\n        PotentialMethod.B1PW91_VWN5_TZP,\n        PotentialMethod.MPW1PW_TZP,\n        PotentialMethod.MPW1K_TZP,\n        PotentialMethod.TAU_HCTH_HYBRID_TZP,\n        PotentialMethod.X3LYP_VWN5_TZP,\n        PotentialMethod.OPBE0_TZP,\n        PotentialMethod.M05_TZP,\n        PotentialMethod.M05_2X_TZP,\n        PotentialMethod.M06_TZP,\n        PotentialMethod.M06_2X_TZP,\n        PotentialMethod.B3LYP_D_TZP,\n        PotentialMethod.KCIS_MODIFIED_SZ,\n        PotentialMethod.KCIS_ORIGINAL_SZ,\n        PotentialMethod.PKZB_SZ,\n        PotentialMethod.VS98_SZ,\n        PotentialMethod.LDA_VWN_SZ,\n        PotentialMethod.PW91_SZ,\n        PotentialMethod.BLYP_SZ,\n        PotentialMethod.BP_SZ,\n        PotentialMethod.PBE_SZ,\n        PotentialMethod.RPBE_SZ,\n        PotentialMethod.REVPBE_SZ,\n        PotentialMethod.OLYP_SZ,\n        PotentialMethod.FT97_SZ,\n        PotentialMethod.BLAP3_SZ,\n        PotentialMethod.HCTH_93_SZ,\n        PotentialMethod.HCTH_120_SZ,\n        PotentialMethod.HCTH_147_SZ,\n        PotentialMethod.HCTH_407_SZ,\n        PotentialMethod.BMTAU1_SZ,\n        PotentialMethod.BOP_SZ,\n        PotentialMethod.PKZBX_KCISCOR_SZ,\n        PotentialMethod.VS98_X_XC_SZ,\n        PotentialMethod.VS98_X_ONLY_SZ,\n        PotentialMethod.BECKE00_SZ,\n        PotentialMethod.BECKE00X_XC_SZ,\n        PotentialMethod.BECKE00_X_ONLY_SZ,\n        PotentialMethod.BECKE88X_BR89C_SZ,\n        PotentialMethod.OLAP3_SZ,\n        PotentialMethod.TPSS_SZ,\n        PotentialMethod.MPBE_SZ,\n        PotentialMethod.OPBE_SZ,\n        PotentialMethod.OPERDEW_SZ,\n        PotentialMethod.MPBEKCIS_SZ,\n        PotentialMethod.MPW_SZ,\n        PotentialMethod.TAU_HCTH_SZ,\n        PotentialMethod.XLYP_SZ,\n        PotentialMethod.KT1_SZ,\n        PotentialMethod.KT2_SZ,\n        
PotentialMethod.M06_L_SZ,\n        PotentialMethod.BLYP_D_SZ,\n        PotentialMethod.BP86_D_SZ,\n        PotentialMethod.PBE_D_SZ,\n        PotentialMethod.TPSSD_SZ,\n        PotentialMethod.B97_D_SZ,\n        PotentialMethod.REVTPSS_SZ,\n        PotentialMethod.PBESOL_SZ,\n        PotentialMethod.RGE2_SZ,\n        PotentialMethod.SSB_D_SZ,\n        PotentialMethod.MVS_SZ,\n        PotentialMethod.MVSX_SZ,\n        PotentialMethod.TMGGA_SZ,\n        PotentialMethod.TPSSH_SZ,\n        PotentialMethod.B3LYP_VWN5_SZ,\n        PotentialMethod.O3LYP_VWN5_SZ,\n        PotentialMethod.KMLYP_VWN5_SZ,\n        PotentialMethod.PBE0_SZ,\n        PotentialMethod.B3LYP_S_VWN5_SZ,\n        PotentialMethod.BHANDH_SZ,\n        PotentialMethod.BHANDHLYP_SZ,\n        PotentialMethod.B97_SZ,\n        PotentialMethod.B97_1_SZ,\n        PotentialMethod.B97_2_SZ,\n        PotentialMethod.MPBE0KCIS_SZ,\n        PotentialMethod.MPBE1KCIS_SZ,\n        PotentialMethod.B1LYP_VWN5_SZ,\n        PotentialMethod.B1PW91_VWN5_SZ,\n        PotentialMethod.MPW1PW_SZ,\n        PotentialMethod.MPW1K_SZ,\n        PotentialMethod.TAU_HCTH_HYBRID_SZ,\n        PotentialMethod.X3LYP_VWN5_SZ,\n        PotentialMethod.OPBE0_SZ,\n        PotentialMethod.M05_SZ,\n        PotentialMethod.M05_2X_SZ,\n        PotentialMethod.M06_SZ,\n        PotentialMethod.M06_2X_SZ,\n        PotentialMethod.B3LYP_D_SZ,\n        PotentialMethod.GFN2_XTB,\n    ]\n\n    energy_target_names = [\n        \"KCIS-MODIFIED/DZP\",\n        \"KCIS-ORIGINAL/DZP\",\n        \"PKZB/DZP\",\n        \"VS98/DZP\",\n        \"LDA(VWN)/DZP\",\n        \"PW91/DZP\",\n        \"BLYP/DZP\",\n        \"BP/DZP\",\n        \"PBE/DZP\",\n        \"RPBE/DZP\",\n        \"REVPBE/DZP\",\n        \"OLYP/DZP\",\n        \"FT97/DZP\",\n        \"BLAP3/DZP\",\n        \"HCTH/93/DZP\",\n        \"HCTH/120/DZP\",\n        \"HCTH/147/DZP\",\n        \"HCTH/407/DZP\",\n        \"BMTAU1/DZP\",\n        \"BOP/DZP\",\n        \"PKZBX-KCISCOR/DZP\",\n        
\"VS98-X(XC)/DZP\",\n        \"VS98-X-ONLY/DZP\",\n        \"BECKE00/DZP\",\n        \"BECKE00X(XC)/DZP\",\n        \"BECKE00-X-ONLY/DZP\",\n        \"BECKE88X+BR89C/DZP\",\n        \"OLAP3/DZP\",\n        \"TPSS/DZP\",\n        \"MPBE/DZP\",\n        \"OPBE/DZP\",\n        \"OPERDEW/DZP\",\n        \"MPBEKCIS/DZP\",\n        \"MPW/DZP\",\n        \"TAU-HCTH/DZP\",\n        \"XLYP/DZP\",\n        \"KT1/DZP\",\n        \"KT2/DZP\",\n        \"M06-L/DZP\",\n        \"BLYP-D/DZP\",\n        \"BP86-D/DZP\",\n        \"PBE-D/DZP\",\n        \"TPSS-D/DZP\",\n        \"B97-D/DZP\",\n        \"REVTPSS/DZP\",\n        \"PBESOL/DZP\",\n        \"RGE2/DZP\",\n        \"SSB-D/DZP\",\n        \"MVS/DZP\",\n        \"MVSX/DZP\",\n        \"T-MGGA/DZP\",\n        \"TPSSH/DZP\",\n        \"B3LYP(VWN5)/DZP\",\n        \"O3LYP(VWN5)/DZP\",\n        \"KMLYP(VWN5)/DZP\",\n        \"PBE0/DZP\",\n        \"B3LYP*(VWN5)/DZP\",\n        \"BHANDH/DZP\",\n        \"BHANDHLYP/DZP\",\n        \"B97/DZP\",\n        \"B97-1/DZP\",\n        \"B97-2/DZP\",\n        \"MPBE0KCIS/DZP\",\n        \"MPBE1KCIS/DZP\",\n        \"B1LYP(VWN5)/DZP\",\n        \"B1PW91(VWN5)/DZP\",\n        \"MPW1PW/DZP\",\n        \"MPW1K/DZP\",\n        \"TAU-HCTH-HYBRID/DZP\",\n        \"X3LYP(VWN5)/DZP\",\n        \"OPBE0/DZP\",\n        \"M05/DZP\",\n        \"M05-2X/DZP\",\n        \"M06/DZP\",\n        \"M06-2X/DZP\",\n        \"B3LYP-D/DZP\",\n        \"KCIS-MODIFIED/TZP\",\n        \"KCIS-ORIGINAL/TZP\",\n        \"PKZB/TZP\",\n        \"VS98/TZP\",\n        \"LDA(VWN)/TZP\",\n        \"PW91/TZP\",\n        \"BLYP/TZP\",\n        \"BP/TZP\",\n        \"PBE/TZP\",\n        \"RPBE/TZP\",\n        \"REVPBE/TZP\",\n        \"OLYP/TZP\",\n        \"FT97/TZP\",\n        \"BLAP3/TZP\",\n        \"HCTH/93/TZP\",\n        \"HCTH/120/TZP\",\n        \"HCTH/147/TZP\",\n        \"HCTH/407/TZP\",\n        \"BMTAU1/TZP\",\n        \"BOP/TZP\",\n        \"PKZBX-KCISCOR/TZP\",\n        \"VS98-X(XC)/TZP\",\n        
\"VS98-X-ONLY/TZP\",\n        \"BECKE00/TZP\",\n        \"BECKE00X(XC)/TZP\",\n        \"BECKE00-X-ONLY/TZP\",\n        \"BECKE88X+BR89C/TZP\",\n        \"OLAP3/TZP\",\n        \"TPSS/TZP\",\n        \"MPBE/TZP\",\n        \"OPBE/TZP\",\n        \"OPERDEW/TZP\",\n        \"MPBEKCIS/TZP\",\n        \"MPW/TZP\",\n        \"TAU-HCTH/TZP\",\n        \"XLYP/TZP\",\n        \"KT1/TZP\",\n        \"KT2/TZP\",\n        \"M06-L/TZP\",\n        \"BLYP-D/TZP\",\n        \"BP86-D/TZP\",\n        \"PBE-D/TZP\",\n        \"TPSS-D/TZP\",\n        \"B97-D/TZP\",\n        \"REVTPSS/TZP\",\n        \"PBESOL/TZP\",\n        \"RGE2/TZP\",\n        \"SSB-D/TZP\",\n        \"MVS/TZP\",\n        \"MVSX/TZP\",\n        \"T-MGGA/TZP\",\n        \"TPSSH/TZP\",\n        \"B3LYP(VWN5)/TZP\",\n        \"O3LYP(VWN5)/TZP\",\n        \"KMLYP(VWN5)/TZP\",\n        \"PBE0/TZP\",\n        \"B3LYP*(VWN5)/TZP\",\n        \"BHANDH/TZP\",\n        \"BHANDHLYP/TZP\",\n        \"B97/TZP\",\n        \"B97-1/TZP\",\n        \"B97-2/TZP\",\n        \"MPBE0KCIS/TZP\",\n        \"MPBE1KCIS/TZP\",\n        \"B1LYP(VWN5)/TZP\",\n        \"B1PW91(VWN5)/TZP\",\n        \"MPW1PW/TZP\",\n        \"MPW1K/TZP\",\n        \"TAU-HCTH-HYBRID/TZP\",\n        \"X3LYP(VWN5)/TZP\",\n        \"OPBE0/TZP\",\n        \"M05/TZP\",\n        \"M05-2X/TZP\",\n        \"M06/TZP\",\n        \"M06-2X/TZP\",\n        \"B3LYP-D/TZP\",\n        \"KCIS-MODIFIED/SZ\",\n        \"KCIS-ORIGINAL/SZ\",\n        \"PKZB/SZ\",\n        \"VS98/SZ\",\n        \"LDA(VWN)/SZ\",\n        \"PW91/SZ\",\n        \"BLYP/SZ\",\n        \"BP/SZ\",\n        \"PBE/SZ\",\n        \"RPBE/SZ\",\n        \"REVPBE/SZ\",\n        \"OLYP/SZ\",\n        \"FT97/SZ\",\n        \"BLAP3/SZ\",\n        \"HCTH/93/SZ\",\n        \"HCTH/120/SZ\",\n        \"HCTH/147/SZ\",\n        \"HCTH/407/SZ\",\n        \"BMTAU1/SZ\",\n        \"BOP/SZ\",\n        \"PKZBX-KCISCOR/SZ\",\n        \"VS98-X(XC)/SZ\",\n        \"VS98-X-ONLY/SZ\",\n        \"BECKE00/SZ\",\n        
\"BECKE00X(XC)/SZ\",\n        \"BECKE00-X-ONLY/SZ\",\n        \"BECKE88X+BR89C/SZ\",\n        \"OLAP3/SZ\",\n        \"TPSS/SZ\",\n        \"MPBE/SZ\",\n        \"OPBE/SZ\",\n        \"OPERDEW/SZ\",\n        \"MPBEKCIS/SZ\",\n        \"MPW/SZ\",\n        \"TAU-HCTH/SZ\",\n        \"XLYP/SZ\",\n        \"KT1/SZ\",\n        \"KT2/SZ\",\n        \"M06-L/SZ\",\n        \"BLYP-D/SZ\",\n        \"BP86-D/SZ\",\n        \"PBE-D/SZ\",\n        \"TPSS-D/SZ\",\n        \"B97-D/SZ\",\n        \"REVTPSS/SZ\",\n        \"PBESOL/SZ\",\n        \"RGE2/SZ\",\n        \"SSB-D/SZ\",\n        \"MVS/SZ\",\n        \"MVSX/SZ\",\n        \"T-MGGA/SZ\",\n        \"TPSSH/SZ\",\n        \"B3LYP(VWN5)/SZ\",\n        \"O3LYP(VWN5)/SZ\",\n        \"KMLYP(VWN5)/SZ\",\n        \"PBE0/SZ\",\n        \"B3LYP*(VWN5)/SZ\",\n        \"BHANDH/SZ\",\n        \"BHANDHLYP/SZ\",\n        \"B97/SZ\",\n        \"B97-1/SZ\",\n        \"B97-2/SZ\",\n        \"MPBE0KCIS/SZ\",\n        \"MPBE1KCIS/SZ\",\n        \"B1LYP(VWN5)/SZ\",\n        \"B1PW91(VWN5)/SZ\",\n        \"MPW1PW/SZ\",\n        \"MPW1K/SZ\",\n        \"TAU-HCTH-HYBRID/SZ\",\n        \"X3LYP(VWN5)/SZ\",\n        \"OPBE0/SZ\",\n        \"M05/SZ\",\n        \"M05-2X/SZ\",\n        \"M06/SZ\",\n        \"M06-2X/SZ\",\n        \"B3LYP-D/SZ\",\n        \"GFNXTB\",\n    ]\n\n    __energy_unit__ = \"ev\"  # to fix\n    __distance_unit__ = \"ang\"  # to fix\n    __forces_unit__ = \"ev/ang\"  # to fix\n    __links__ = {\n        \"xyz.zip\": \"https://data.dtu.dk/ndownloader/files/35143624\",\n        \"xtb.zip\": \"https://data.dtu.dk/ndownloader/files/42444300\",\n        \"dzp.zip\": \"https://data.dtu.dk/ndownloader/files/42443925\",\n        \"tzp.zip\": \"https://data.dtu.dk/ndownloader/files/42444129\",\n        \"sz.zip\": \"https://data.dtu.dk/ndownloader/files/42441345\",\n        \"failed_indices.dat\": \"https://data.dtu.dk/ndownloader/files/37337677\",\n    }\n\n    def _read_molecules_energies(self):\n        d = {\"DZP\": None, \"TZP\": 
None, \"SZ\": None, \"XTB\": None}\n        for basis in d.keys():\n            d[basis] = pd.read_csv(p_join(self.root, basis, \"molecules/molecules.csv\"), index_col=False).drop(\n                columns=[\"index\"]\n            )\n        return pd.concat([d[\"DZP\"], d[\"TZP\"], d[\"SZ\"], d[\"XTB\"]], axis=1, ignore_index=False)\n\n    def _read_all_xyzs(self):\n        xyz_list = read_xyz_files(self.root)\n        return pd.DataFrame(xyz_list)\n\n    def read_raw_entries(self):\n        df_energies = self._read_molecules_energies()\n        df_xyz = self._read_all_xyzs()\n        return [\n            {\"energies\": np.atleast_2d(en), **xyz_dict}\n            for xyz_dict, en in zip(df_xyz.to_dict(\"records\"), df_energies.values.astype(np.float64))\n        ]\n
    "},{"location":"API/datasets/nabladft.html","title":"NablaDFT","text":""},{"location":"API/datasets/nabladft.html#openqdc.datasets.potential.nabladft.NablaDFT","title":"NablaDFT","text":"

    Bases: BaseDataset

    NablaDFT is a dataset constructed from a subset of the Molecular Sets (MOSES) dataset consisting of 1 million molecules with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of conformations is generated using RDKit. Second, using the Butina clustering method on the conformations, clusters that cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set. This results in 1-62 conformations per molecule. For generating quantum properties, the Kohn-Sham method at the wB97X-D/def2-SVP level is used to compute the energy.

    Usage:

    from openqdc.datasets import NablaDFT\ndataset = NablaDFT()\n

    References

    https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D

    https://github.com/AIRI-Institute/nablaDFT

    Source code in openqdc/datasets/potential/nabladft.py
    class NablaDFT(BaseDataset):\n    \"\"\"\n    NablaDFT is a dataset constructed from a subset of the\n    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules\n    with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of\n    conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that\n    cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set.\n    This results in 1-62 conformations per molecule. For generating quantum properties, Kohn-Sham method at\n    wB97X-D/def2-XVP levels are used to generate the energy.\n\n    Usage:\n    ```python\n    from openqdc.datasets import NablaDFT\n    dataset = NablaDFT()\n    ```\n\n    References:\n        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\\n\n        https://github.com/AIRI-Institute/nablaDFT\n    \"\"\"\n\n    __name__ = \"nabladft\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D_DEF2_SVP,\n    ]  # \"wb97x-d/def2-svp\"\n\n    energy_target_names = [\"wb97x-d/def2-svp\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"nabladft.db\": \"https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db\"}\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    @requires_package(\"nablaDFT\")\n    def read_raw_entries(self):\n        from nablaDFT.dataset import HamiltonianDatabase\n\n        label_path = p_join(self.root, \"summary.csv\")\n        df = pd.read_csv(label_path, usecols=[\"MOSES id\", \"CONFORMER id\", \"SMILES\", \"DFT TOTAL ENERGY\"])\n        labels = 
df.set_index(keys=[\"MOSES id\", \"CONFORMER id\"]).to_dict(\"index\")\n\n        raw_path = p_join(self.root, \"dataset_full.db\")\n        train = HamiltonianDatabase(raw_path)\n        n, c = len(train), 20\n        step_size = int(np.ceil(n / os.cpu_count()))\n\n        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)\n        samples = dm.parallelized(\n            fn, list(range(c)), n_jobs=c, progress=False, scheduler=\"threads\"\n        )  # don't use more than 1 job\n\n        return sum(samples, [])\n
    "},{"location":"API/datasets/orbnet_denali.html","title":"Orbnet Denali","text":""},{"location":"API/datasets/orbnet_denali.html#openqdc.datasets.potential.orbnet_denali.OrbnetDenali","title":"OrbnetDenali","text":"

    Bases: BaseDataset

    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of theory.

    Usage:

    from openqdc.datasets import OrbnetDenali\ndataset = OrbnetDenali()\n

    References

    https://arxiv.org/abs/2107.00299

    https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867

    Source code in openqdc/datasets/potential/orbnet_denali.py
    class OrbnetDenali(BaseDataset):\n    \"\"\"\n    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range\n    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and\n    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps.\n    First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer\n    generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using\n    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of\n    theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of\n    theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import OrbnetDenali\n    dataset = OrbnetDenali()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00299\\n\n        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867\n    \"\"\"\n\n    __name__ = \"orbnet_denali\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_DEF2_TZVP,\n        PotentialMethod.GFN1_XTB,\n    ]  # [\"wb97x-d3/def2-tzvp\", \"gfn1_xtb\"]\n    energy_target_names = [\"dft_energy\", \"xtb1_energy\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"orbnet_denali.tar.gz\": \"https://figshare.com/ndownloader/files/28672287\",\n        \"orbnet_denali_targets.tar.gz\": \"https://figshare.com/ndownloader/files/28672248\",\n    }\n\n    def read_raw_entries(self):\n        label_path = p_join(self.root, \"denali_labels.csv\")\n        df = pd.read_csv(label_path, usecols=[\"sample_id\", \"mol_id\", \"subset\", \"dft_energy\", \"xtb1_energy\"])\n        labels = {\n            mol_id: 
group.drop([\"mol_id\"], axis=1).drop_duplicates(\"sample_id\").set_index(\"sample_id\").to_dict(\"index\")\n            for mol_id, group in df.groupby(\"mol_id\")\n        }\n\n        fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)\n        res = dm.parallelized(fn, list(labels.items()), scheduler=\"threads\", n_jobs=-1, progress=True)\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/pcqm.html","title":"PCQM","text":""},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_B3LYP","title":"PCQM_B3LYP","text":"

    Bases: PCQM_PM6

    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to biomolecules. The geometries of the molecules are optimized using PM6. Using the optimized geometry, the electronic structure and properties are calculated using the B3LYP/6-31G* method.

    Usage:

    from openqdc.datasets import PCQM_B3LYP\ndataset = PCQM_B3LYP()\n

    References

    https://arxiv.org/abs/2305.18454

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_B3LYP(PCQM_PM6):\n    \"\"\"\n    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to\n    biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry,\n    the electronic structure and properties are calculated using B3LIP/6-31G* method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_B3LYP\n    dataset = PCQM_B3LYP()\n    ```\n\n    References:\n        https://arxiv.org/abs/2305.18454\n    \"\"\"\n\n    __name__ = \"pubchemqc_b3lyp\"\n    __energy_methods__ = [\"b3lyp/6-31g*\"]\n    energy_target_names = [\"b3lyp\"]\n
    "},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_PM6","title":"PCQM_PM6","text":"

    Bases: BaseDataset

    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with weights less than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also computed using the PM6 method.

    Usage:

    from openqdc.datasets import PCQM_PM6\ndataset = PCQM_PM6()\n

    References

    https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_PM6(BaseDataset):\n    \"\"\"\n    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized\n    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less\n    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel\n    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also\n    computed using the PM6 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_PM6\n    dataset = PCQM_PM6()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740\n    \"\"\"\n\n    __name__ = \"pubchemqc_pm6\"\n    __energy_methods__ = [PotentialMethod.PM6]\n\n    energy_target_names = [\"pm6\"]\n\n    __force_methods__ = []\n    force_target_names = []\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"pubchemqc\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def collate_list(self, list_entries):\n        predicat = list_entries is not None and len(list_entries) > 0\n        list_entries = [x for x in list_entries if x is not None]\n        if predicat:\n            res = super().collate_list(list_entries)\n        else:\n            res = None\n        return res\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    def read_raw_entries(self):\n        arxiv_paths = glob(p_join(self.root, f\"{self.__energy_methods__[0]}\", \"*.pkl\"))\n        f = lambda x: self.collate_list(read_preprocessed_archive(x))\n        samples = dm.parallelized(f, arxiv_paths, 
n_jobs=1, progress=True)\n        samples = [x for x in samples if x is not None]\n        return samples\n\n    def preprocess(self, overwrite=False):\n        if overwrite or not self.is_preprocessed():\n            logger.info(\"Preprocessing data and saving it to cache.\")\n            logger.info(\n                f\"Dataset {self.__name__} data with the following units:\\n\"\n                f\"Energy: {self.energy_unit}, Distance: {self.distance_unit}, \"\n                f\"Forces: {self.force_unit if self.__force_methods__ else 'None'}\"\n            )\n            entries = self.read_raw_entries()\n            self.collate_and_save_list(entries)\n\n    def collate_and_save_list(self, list_entries):\n        n_molecules, n_atoms = 0, 0\n        for i in range(len(list_entries)):\n            list_entries[i][\"position_idx_range\"] += n_atoms\n            n_atoms += list_entries[i][\"position_idx_range\"].max()\n            n_molecules += list_entries[i][\"position_idx_range\"].shape[0]\n\n        for key in self.data_keys:\n            first = list_entries[0][key]\n            shape = (n_molecules, *first.shape[1:])\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\")\n            out = np.memmap(local_path, mode=\"w+\", dtype=first.dtype, shape=shape)\n\n            start = 0\n            for i in range(len(list_entries)):\n                x = list_entries[i].pop(key)\n                n = x.shape[0]\n                out[start : start + n] = x\n                out.flush()\n            push_remote(local_path, overwrite=True)\n\n        # save smiles and subset\n        tmp, n = dict(name=[]), len(list_entries)\n        local_path = p_join(self.preprocess_path, \"props.pkl\")\n        names = [list_entries[i].pop(\"name\") for i in range(n)]\n        f = lambda xs: [dm.to_inchikey(x) for x in xs]\n        res = dm.parallelized(f, names, n_jobs=-1, progress=False)\n        for x in res:\n            tmp[\"name\"] += x\n        for key in 
[\"subset\", \"n_atoms\"]:\n            tmp[key] = []\n            for i in range(n):\n                tmp[key] += list(list_entries[i].pop(key))\n        with open(local_path, \"wb\") as f:\n            pkl.dump(tmp, f)\n        push_remote(local_path, overwrite=True)\n
    "},{"location":"API/datasets/proteinfragments.html","title":"Protein Fragments","text":""},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.MDDataset","title":"MDDataset","text":"

    Bases: ProteinFragments

    MDDataset is a subset of the proteinfragments dataset that was generated from molecular dynamics simulations with their model. The sampling was done with Molecular Dynamics at room temperature (300K) in various solvent phases:

    Subsets

    Polyalanine: All the polyalanine peptides are sampled in the gas phase. AceAla15Lys is a polyalanine peptide capped with an N-terminal acetyl group and a protonated lysine residue at the C-terminus; AceAla15NMe is a polyalanine peptide capped with an N-terminal acetyl group and a C-terminal N-methyl amide group.

    Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)

    Usage:

    from openqdc.datasets import MDDataset\ndataset = MDDataset()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class MDDataset(ProteinFragments):\n    \"\"\"\n    MDDataset is a subset of the proteinfragments dataset that\n    generated from the molecular dynamics with their model.\n    The sampling was done with Molecular Dynamics\n    at room temperature 300K in various solvent phase:\n\n    Subsets:\n        Polyalanine:\n            All the polyalanine are sampled in gas phase. AceAla15Lys is\n            a polyalanine peptides capped with an N-terminal acetyl group\n            and a protonated lysine residue at the C-terminus,\n            Acela15nme is polyalanine peptide capped with an N-terminal acetyl group\n            and a C-terminal N-methyl amide group\\n\n        Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)\n\n    Usage:\n    ```python\n    from openqdc.datasets import MDDataset\n    dataset = MDDataset()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"mddataset\"\n\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"acala15nme_folding_clusters\", \"crambin\", \"minimahopping_acala15lysh\", \"minimahopping_acala15nme\"]\n    }\n
    "},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.ProteinFragments","title":"ProteinFragments","text":"

    Bases: BaseDataset

    ProteinFragments is a dataset constructed from data that was generated with a top-down and a bottom-up approach:

    Top-down

    Fragments are generated by cutting out a spherical region around an atom (including solvent molecules) and saturating all dangling bonds. Sampling was done with the Molecular Dynamics (MD) method from conventional FF at room temperature.

    Bottom-up

    Fragments are generated by constructing chemical graphs of one to eight nonhydrogen atoms. Sampling of multiple conformers per fragment was done with MD simulations at high temperatures or normal mode sampling.

    Usage:

    from openqdc.datasets import ProteinFragments\ndataset = ProteinFragments()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class ProteinFragments(BaseDataset):\n    \"\"\"\n    ProteinFragments is a dataset constructed from a subset of the\n    the data was generated from a top-down and bottom-up approach:\n\n    Top-down:\n        Fragments are generated by cutting out a spherical\n        region around an atom (including solvent molecules)\n        and saturating all dangling bonds.\n        Sampling was done with the Molecular Dynamics (MD) method from\n        conventional FF at room temperature.\n\n    Bottom-up:\n        Fragments are generated by constructing chemical graphs\n        of one to eight nonhydrogen atoms.\n        Sampling of multiple conformers per fragments was done with\n        MD simulations at high temperatures or normal mode sampling.\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import ProteinFragments\n    dataset = ProteinFragments()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"proteinfragments\"\n    # PBE0/def2-TZVPP+MBD\n    __energy_methods__ = [\n        PotentialMethod.PBE0_MBD_DEF2_TZVPP,\n    ]\n\n    energy_target_names = [\n        \"PBE0+MBD/def2-TZVPP\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"general_protein_fragments\"]\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"proteinfragments\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"proteinfragments\", links=self.__links__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        samples = []\n      
  for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.extend(read_db(raw_path))\n        return samples\n
    "},{"location":"API/datasets/qm1b.html","title":"QM1B","text":""},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B","title":"QM1B","text":"

    Bases: BaseDataset

    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. Electronic properties for each conformation are then calculated using the density functional B3LYP and the basis set STO-3G.

    Usage:

    from openqdc.datasets import QM1B\ndataset = QM1B()\n

    References

    https://arxiv.org/pdf/2311.01135

    https://github.com/graphcore-research/qm1b-dataset/

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B(BaseDataset):\n    \"\"\"\n    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom\n    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are\n    subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit.\n    Electronic properties for each conformation are then calculated using the density functional B3LYP\n    and the basis set STO-3G.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B\n    dataset = QM1B()\n    ```\n\n    References:\n        https://arxiv.org/pdf/2311.01135\\n\n        https://github.com/graphcore-research/qm1b-dataset/\n    \"\"\"\n\n    __name__ = \"qm1b\"\n\n    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]\n    __force_methods__ = []\n\n    energy_target_names = [\"b3lyp/sto-3g\"]\n    force_target_names = []\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"ev/bohr\"\n    __links__ = {\n        \"qm1b_validation.parquet\": \"https://ndownloader.figshare.com/files/43005175\",\n        **{f\"part_{i:03d}.parquet\": f\"https://ndownloader.figshare.com/files/{FILE_NUM[i]}\" for i in range(0, 256)},\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qm1b\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        filenames = list(map(lambda x: p_join(self.root, f\"part_{x:03d}.parquet\"), list(range(0, 256)))) + [\n            p_join(self.root, \"qm1b_validation.parquet\")\n        ]\n\n        def read_entries_parallel(filename):\n            df = pd.read_parquet(filename)\n\n            def extract_parallel(df, i):\n                return extract_from_row(df.iloc[i])\n\n            fn = 
partial(extract_parallel, df)\n            list_of_idxs = list(range(len(df)))\n            results = dm.utils.parallelized(fn, list_of_idxs, scheduler=\"threads\", progress=False)\n            return results\n\n        list_of_list = dm.utils.parallelized(read_entries_parallel, filenames, scheduler=\"processes\", progress=True)\n\n        return [x for xs in list_of_list for x in xs]\n
    "},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B_SMALL","title":"QM1B_SMALL","text":"

    Bases: QM1B

    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.

    Usage:

    from openqdc.datasets import QM1B_SMALL\ndataset = QM1B_SMALL()\n

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B_SMALL(QM1B):\n    \"\"\"\n    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B_SMALL\n    dataset = QM1B_SMALL()\n    ```\n    \"\"\"\n\n    __name__ = \"qm1b_small\"\n
    "},{"location":"API/datasets/qm7x.html","title":"QM7X","text":""},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X","title":"QM7X","text":"

    Bases: BaseDataset

    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has energy values for each geometry computed with the PBE0-MBD and DFTB3-MBD methods.

    Usage:

    from openqdc.datasets import QM7X\ndataset = QM7X()\n

    References

    https://arxiv.org/abs/2006.15139

    https://zenodo.org/records/4288677

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X(BaseDataset):\n    \"\"\"\n    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with\n    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations,\n    OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-\n    stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure\n    is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)\n    interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non\n    -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of\n    normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has\n    energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X\n    dataset = QM7X()\n    ```\n\n    References:\n        https://arxiv.org/abs/2006.15139\\n\n        https://zenodo.org/records/4288677\n    \"\"\"\n\n    __name__ = \"qm7x\"\n\n    __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # \"pbe0/def2-tzvp\", \"dft3b\"]\n\n    energy_target_names = [\"ePBE0+MBD\", \"eDFTB+MBD\"]\n\n    __force_mask__ = [True, False]\n\n    force_target_names = [\"pbe0FOR\"]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {f\"{i}000.xz\": f\"https://zenodo.org/record/4288677/files/{i}000.xz\" for i in range(1, 9)}\n\n    def read_raw_entries(self):\n        samples = []\n        for i in range(1, 9):\n            raw_path = p_join(self.root, f\"{i}000\")\n            data = load_hdf5_file(raw_path)\n            samples += [\n                
read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys())\n            ]\n\n        return samples\n
    "},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X_V2","title":"QM7X_V2","text":"

    Bases: QM7X

    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.

    Usage:

    from openqdc.datasets import QM7X_V2\ndataset = QM7X_V2()\n

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X_V2(QM7X):\n    \"\"\"\n    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X_V2\n    dataset = QM7X_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qm7x_v2\"\n    __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]\n    __force_mask__ = QM7X.__force_mask__ + [False]\n    energy_target_names = QM7X.energy_target_names + [\"PM6\"]\n    force_target_names = QM7X.force_target_names\n
    "},{"location":"API/datasets/qmugs.html","title":"Qmugs","text":""},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs","title":"QMugs","text":"

    Bases: BaseDataset

    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB method. Using the optimized geometry, the atomic and molecular properties are calculated using both a semi-empirical method (GFN2-xTB) and a DFT method (\u03c9B97X-D/def2-SVP).

    Usage:

    from openqdc.datasets import QMugs\ndataset = QMugs()\n

    References

    https://arxiv.org/abs/2107.00367

    https://www.nature.com/articles/s41597-022-01390-7#ethics

    https://www.research-collection.ethz.ch/handle/20.500.11850/482129

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs(BaseDataset):\n    \"\"\"\n    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules\n    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB\n    method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical\n    method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs\n    dataset = QMugs()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00367\\n\n        https://www.nature.com/articles/s41597-022-01390-7#ethics\\n\n        https://www.research-collection.ethz.ch/handle/20.500.11850/482129\n    \"\"\"\n\n    __name__ = \"qmugs\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP]  # \"gfn2_xtb\", \"wb97x-d/def2-svp\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"summary.csv\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv\",\n        \"structures.tar.gz\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz\",  # noqa\n    }\n\n    energy_target_names = [\n        \"GFN2:TOTAL_ENERGY\",\n        \"DFT:TOTAL_ENERGY\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"structures\")\n        mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]\n\n        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler=\"threads\")\n        return samples\n
    "},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs_V2","title":"QMugs_V2","text":"

    Bases: QMugs

    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.

    Usage:

    from openqdc.datasets import QMugs_V2\ndataset = QMugs_V2()\n

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs_V2(QMugs):\n    \"\"\"\n    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs_V2\n    dataset = QMugs_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qmugs_v2\"\n    __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]\n    energy_target_names = QMugs.energy_target_names + [\"PM6\"]\n    __force_mask__ = QMugs.__force_mask__ + [False]\n
    "},{"location":"API/datasets/qmx.html","title":"QMX","text":""},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7","title":"QM7","text":"

    Bases: QMX

    QM7 is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, H]

    Usage:

    from openqdc.datasets import QM7\ndataset = QM7()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7(QMX):\n    \"\"\"\n    QM7 is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7\n    dataset = QM7()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7.hdf5.gz\": \"https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1\"}\n    __name__ = \"qm7\"\n\n    energy_target_names = [\n        \"B2PLYP-D3(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3(BJ):def2-svp\",\n        \"B2PLYP-D3(BJ):def2-tzvp\",\n        \"B2PLYP-D3(BJ):sto-3g\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        
\"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"MP2:aug-cc-pvdz\",\n        \"MP2:aug-cc-pvtz\",\n        \"MP2:def2-svp\",\n        \"MP2:def2-tzvp\",\n        \"MP2:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7b","title":"QM7b","text":"

    Bases: QMX

    QM7b is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, Cl, H]

    Usage:

    from openqdc.datasets import QM7b\ndataset = QM7b()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7b(QMX):\n    \"\"\"\n    QM7b is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, Cl, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7b\n    dataset = QM7b()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7b.hdf5.gz\": \"https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1\"}\n    __name__ = \"qm7b\"\n    energy_target_names = [\n        \"CCSD(T0):cc-pVDZ\",\n        \"HF:cc-pVDZ\",\n        \"HF:cc-pVTZ\",\n        \"MP2:cc-pVTZ\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        
\"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM8","title":"QM8","text":"

    Bases: QMX

    QM8 is the subset of QM9 used in a study on modeling quantum mechanical calculations of electronic spectra and excited state energy (an increase in energy from the ground state) of small molecules up to eight heavy atoms. Multiple methods were used, including time-dependent density functional theories (TDDFT) and second-order approximate coupled-cluster (CC2). The molecular conformations are relaxed geometries computed using the DFT B3LYP with basis set 6-31G(2df,p). For more information about the sampling, check QM9 dataset.

    Usage:

    from openqdc.datasets import QM8\ndataset = QM8()\n

    References

    https://arxiv.org/pdf/1504.01966

    Source code in openqdc/datasets/potential/qmx.py
    class QM8(QMX):\n    \"\"\"QM8 is the subset of QM9 used in a study on modeling quantum\n    mechanical calculations of electronic spectra and excited\n    state energy (a increase of energy from the ground states) of small molecules\n    up to eight heavy atoms.\n    Multiple methods were used, including\n    time-dependent density functional theories (TDDFT) and\n    second-order approximate coupled-cluster (CC2).\n    The molecules conformations are relaxed geometries computed using\n    the DFT B3LYP with basis set 6-31G(2df,p).\n    For more information about the sampling, check QM9 dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM8\n    dataset = QM8()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1504.01966\n    \"\"\"\n\n    __name__ = \"qm8\"\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n\n    __links__ = {\n        \"qm8.csv\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv\",\n        \"qm8.tar.gz\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz\",\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"qm8.csv\"))\n        mols = dm.read_sdf(p_join(self.root, \"qm8.sdf\"), sanitize=False, remove_hs=False)\n        samples = []\n        for idx_row, mol in zip(df.iterrows(), mols):\n            _, row = idx_row\n            positions = mol.GetConformer().GetPositions()\n            x = get_atomic_number_and_charge(mol)\n            n_atoms = positions.shape[0]\n            samples.append(\n                dict(\n                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),\n                    name=np.array([row[\"smiles\"]]),\n        
            energies=np.array(\n                        [\n                            row[\n                                [\"E1-CC2\", \"E2-CC2\", \"E1-PBE0\", \"E2-PBE0\", \"E1-PBE0.1\", \"E2-PBE0.1\", \"E1-CAM\", \"E2-CAM\"]\n                            ].tolist()\n                        ],\n                        dtype=np.float64,\n                    ).reshape(1, -1),\n                    n_atoms=np.array([n_atoms], dtype=np.int32),\n                    subset=np.array([f\"{self.__name__}\"]),\n                )\n            )\n        return samples\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM9","title":"QM9","text":"

    Bases: QMX

    QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database, containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed by relaxing geometries with the quantum mechanical method B3LYP.

    Usage:

    from openqdc.datasets import QM9\ndataset = QM9()\n

    Reference

    https://www.nature.com/articles/sdata201422

    Source code in openqdc/datasets/potential/qmx.py
    class QM9(QMX):\n    \"\"\"\n    QM7b is a dataset constructed containing 134k molecules from subsets of the GDB-17 database,\n    containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at B3LUP/6-31G(2df,p)\n    level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed\n    by relaxing geometries with quantum mechanical method B3LYP.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM9\n    dataset = QM9()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/sdata201422\n    \"\"\"\n\n    __links__ = {\"qm9.hdf5.gz\": \"https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1\"}\n    __name__ = \"qm9\"\n    energy_target_names = [\n        \"Internal energy at 0 K\",\n        \"B3LYP:def2-svp\",\n        \"HF:cc-pvtz\",\n        \"HF:sto-3g\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n    ]\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QMX","title":"QMX","text":"

    Bases: ABC, BaseDataset

    QMX dataset base abstract class

    Source code in openqdc/datasets/potential/qmx.py
    class QMX(ABC, BaseDataset):\n    \"\"\"\n    QMX dataset base abstract class\n    \"\"\"\n\n    __name__ = \"qm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qmx\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"qmx\", links=self.__links__)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)\n        return samples\n
    "},{"location":"API/datasets/revmd17.html","title":"RevMD17","text":""},{"location":"API/datasets/revmd17.html#openqdc.datasets.potential.revmd17.RevMD17","title":"RevMD17","text":"

    Bases: BaseDataset

    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT integration grid. The dataset contains the following molecules: Benzene: 627000 samples

    Uracil: 133000 samples\n\nNaphthalene: 326000 samples\n\nAspirin: 211000 samples\n\nSalicylic Acid: 320000 samples\n\nMalonaldehyde: 993000 samples\n\nEthanol: 555000 samples\n\nToluene: 100000 samples\n

    Usage:

    from openqdc.datasets import RevMD17\ndataset = RevMD17()\n

    References

    https://arxiv.org/abs/2007.09593

    Source code in openqdc/datasets/potential/revmd17.py
    class RevMD17(BaseDataset):\n    \"\"\"\n    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original\n    dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies\n    are computed at the PBE/def2-SVP level of theory using very tigh SCF convergence and very dense DFT integration\n    grid. The dataset contains the following molecules:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import RevMD17\n    dataset = RevMD17()\n    ```\n\n    References:\n        https://arxiv.org/abs/2007.09593\n    \"\"\"\n\n    __name__ = \"revmd17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP\n        # \"pbe/def2-tzvp\",\n    ]\n    __force_mask__ = [True]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_methods__ = [\n        \"pbe/def2-tzvp\",\n    ]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n    __links__ = {\"revmd17.zip\": \"https://figshare.com/ndownloader/articles/12672038/versions/3\"}\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    def read_raw_entries(self):\n        entries_list = []\n        decompress_tar_gz(p_join(self.root, \"rmd17.tar.bz2\"))\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/sn2_rxn.html","title":"SN2 RXN","text":""},{"location":"API/datasets/sn2_rxn.html#openqdc.datasets.potential.sn2_rxn.SN2RXN","title":"SN2RXN","text":"

    Bases: BaseDataset

    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset contains 452,709 structures along with the energy, force and dipole moments.

    Usage:

    from openqdc.datasets import SN2RXN\ndataset = SN2RXN()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605341

    Source code in openqdc/datasets/potential/sn2_rxn.py
    class SN2RXN(BaseDataset):\n    \"\"\"\n    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X +  Y-, and\n    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by\n    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment\n    (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and\n    for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset\n    contains 452,709 structures along with the energy, force and dipole moments.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SN2RXN\n    dataset = SN2RXN()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605341\n    \"\"\"\n\n    __name__ = \"sn2_rxn\"\n\n    __energy_methods__ = [\n        PotentialMethod.DSD_BLYP_D3_BJ_DEF2_TZVP\n        # \"dsd-blyp-d3(bj)/def2-tzvp\",\n    ]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"sn2_rxn.npz\": \"https://zenodo.org/records/2605341/files/sn2_reactions.npz\"}\n\n    energy_target_names = [\n        # TODO: We need to revalidate this to make sure that is not atomization energies.\n        \"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"DSD-BLYP-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"sn2_rxn.npz\")\n        data = np.load(raw_path)\n        samples = extract_npz_entry(data)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html","title":"Solvated Peptides","text":""},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides","title":"SolvatedPeptides","text":"

    Bases: BaseDataset

    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\" and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are run at 1000K with a time-step of 0.1fs using the Atomic Simulation Environment (ASE). Structures are saved every 10 steps, where energies, forces and dipole moments are calculated at the revPBE-D3(BJ)/def2-TZVP level of theory.

    Usage:

    from openqdc.datasets import SolvatedPeptides\ndataset = SolvatedPeptides()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605372

    Source code in openqdc/datasets/potential/solvated_peptides.py
    class SolvatedPeptides(BaseDataset):\n    \"\"\"\n    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\"\n    and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are\n    run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10\n    steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SolvatedPeptides\n    dataset = SolvatedPeptides()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605372\n    \"\"\"\n\n    __name__ = \"solvated_peptides\"\n\n    __energy_methods__ = [\n        PotentialMethod.REVPBE_D3_BJ_DEF2_TZVP\n        # \"revpbe-d3(bj)/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    # TO CHECK\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"solvated_peptides.hdf5.gz\": \"https://zenodo.org/record/3585804/files/213.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"solvated_peptides.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"solvated_peptides\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/solvated_peptides.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/spice.html","title":"Spice","text":""},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.Spice","title":"Spice","text":"

    Bases: BaseDataset

    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate 100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the wB97M-D3(BJ)/def2-TZVPPD level of theory.

    Usage:

    from openqdc.datasets import Spice\ndataset = Spice()\n

    References

    https://arxiv.org/abs/2209.10702

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class Spice(BaseDataset):\n    \"\"\"\n    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of\n    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,\n    and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate\n    100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and\n    molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the\n    wB97M-D3(BJ)/def2-TZVPPD level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Spice\n    dataset = Spice()\n    ```\n\n    References:\n        https://arxiv.org/abs/2209.10702\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice\"\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n\n    energy_target_names = [\"dft_total_energy\"]\n\n    force_target_names = [\"dft_total_gradient\"]\n\n    subset_mapping = {\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Dipeptides Single Points Dataset v1.2\": \"Dipeptides\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.0\": \"DES370K Dimers\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset 
v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE Ion Pairs Single Points Dataset v1.1\": \"Ion Pairs\",\n    }\n    __links__ = {\"SPICE-1.1.4.hdf5\": \"https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5\"}\n\n    def convert_forces(self, x):\n        return (-1.0) * super().convert_forces(x)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"SPICE-1.1.4.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        tmp = [read_record(data[mol_name], self) for mol_name in tqdm(data)]  # don't use parallelized here\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceV2","title":"SpiceV2","text":"

    Bases: Spice

    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules. The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.

    Usage:

    from openqdc.datasets import SpiceV2\ndataset = SpiceV2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceV2(Spice):\n    \"\"\"\n    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules.\n    The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain\n    silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve\n    sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and\n    (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceV2\n    dataset = SpiceV2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spicev2\"\n\n    subset_mapping = {\n        \"SPICE Dipeptides Single Points Dataset v1.3\": \"Dipeptides\",\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Water Clusters v1.0\": \"Water Clusters\",\n        \"SPICE Solvated PubChem Set 1 v1.0\": \"Solvated PubChem\",\n        \"SPICE Amino Acid Ligand v1.0\": \"Amino Acid Ligand\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 7 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 8 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 9 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem 
Set 10 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.1\": \"DES370K Dimers\",\n        \"SPICE PubChem Boron Silicon v1.0\": \"PubChem Boron Silicon\",\n        \"SPICE Ion Pairs Single Points Dataset v1.2\": \"Ion Pairs\",\n    }\n    __links__ = {\"spice-2.0.0.hdf5\": \"https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"spice-2.0.0.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        # Entry 40132 without positions, skip it\n        # don't use parallelized here\n        tmp = [read_record(data[mol_name], self) for i, mol_name in enumerate(tqdm(data)) if i != 40132]\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceVL2","title":"SpiceVL2","text":"

    Bases: SpiceV2

    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.

    Usage:

    from openqdc.datasets import SpiceVL2\ndataset = SpiceVL2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceVL2(SpiceV2):\n    \"\"\"\n    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceVL2\n    dataset = SpiceVL2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice_vl2\"\n\n    __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]\n    energy_target_names = SpiceV2.energy_target_names + [\"GFN2,\" \"PM6\"]\n    __force_mask__ = SpiceV2.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.read_record","title":"read_record(r, obj)","text":"

    Read record from hdf5 file. r : hdf5 record obj : Spice class object used to grab subset and names

    Source code in openqdc/datasets/potential/spice.py
    def read_record(r, obj):\n    \"\"\"\n    Read record from hdf5 file.\n        r : hdf5 record\n        obj : Spice class object used to grab subset and names\n    \"\"\"\n    smiles = r[\"smiles\"].asstr()[0]\n    subset = r[\"subset\"][0].decode(\"utf-8\")\n    n_confs = r[\"conformations\"].shape[0]\n    x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))\n    positions = r[\"conformations\"][:]\n\n    res = dict(\n        name=np.array([smiles] * n_confs),\n        subset=np.array([obj.subset_mapping[subset]] * n_confs),\n        energies=r[obj.energy_target_names[0]][:][:, None].astype(np.float64),\n        forces=r[obj.force_target_names[0]][:].reshape(\n            -1, 3, 1\n        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method\n        atomic_inputs=np.concatenate(\n            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32\n        ).reshape(-1, 5),\n        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/splinter.html","title":"Splinter","text":""},{"location":"API/datasets/splinter.html#openqdc.datasets.interaction.splinter.Splinter","title":"Splinter","text":"

    Bases: BaseInteractionDataset

    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated by quantum mechanical optimization at the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies and the various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.

    Usage:

    from openqdc.datasets import Splinter\ndataset = Splinter()\n

    Reference

    https://doi.org/10.1038/s41597-023-02443-1

    Source code in openqdc/datasets/interaction/splinter.py
    class Splinter(BaseInteractionDataset):\n    \"\"\"\n    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated\n    by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies\n    and the various components are computed using SAPT0/qug-cc-pV(D=d)Z method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Splinter\n    dataset = Splinter()\n    ```\n\n    Reference:\n        https://doi.org/10.1038/s41597-023-02443-1\n    \"\"\"\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __name__ = \"splinter\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        # \"sapt0/jun-cc-pV(D+d)Z_unscaled\", #TODO: we need to pick the unscaled version only here\n        # \"sapt0/jun-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_unscaled\",\n        # 
\"sapt0/jun-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_scaled\",\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = []\n    __links__ = {\n        \"dimerpairs.0.tar.gz\": \"https://figshare.com/ndownloader/files/39449167\",\n        \"dimerpairs.1.tar.gz\": \"https://figshare.com/ndownloader/files/40271983\",\n        \"dimerpairs.2.tar.gz\": \"https://figshare.com/ndownloader/files/40271989\",\n        \"dimerpairs.3.tar.gz\": \"https://figshare.com/ndownloader/files/40272001\",\n        \"dimerpairs.4.tar.gz\": \"https://figshare.com/ndownloader/files/40272022\",\n        \"dimerpairs.5.tar.gz\": \"https://figshare.com/ndownloader/files/40552931\",\n        \"dimerpairs.6.tar.gz\": 
\"https://figshare.com/ndownloader/files/40272040\",\n        \"dimerpairs.7.tar.gz\": \"https://figshare.com/ndownloader/files/40272052\",\n        \"dimerpairs.8.tar.gz\": \"https://figshare.com/ndownloader/files/40272061\",\n        \"dimerpairs.9.tar.gz\": \"https://figshare.com/ndownloader/files/40272064\",\n        \"dimerpairs_nonstandard.tar.gz\": \"https://figshare.com/ndownloader/files/40272067\",\n        \"lig_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272070\",\n        \"lig_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272073\",\n        \"prot_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272076\",\n        \"prot_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272079\",\n        \"merge_monomers.py\": \"https://figshare.com/ndownloader/files/41807682\",\n    }\n\n    def read_raw_entries(self) -> List[Dict]:\n        logger.info(f\"Reading Splinter interaction data from {self.root}\")\n        data = []\n        i = 0\n        with tqdm(total=1680022) as progress_bar:\n            for root, dirs, files in os.walk(self.root):  # total is currently an approximation\n                for filename in files:\n                    if not filename.endswith(\".xyz\"):\n                        continue\n                    i += 1\n                    filepath = os.path.join(root, filename)\n                    filein = open(filepath, \"r\")\n                    lines = list(map(lambda x: x.strip(), filein.readlines()))\n                    n_atoms = np.array([int(lines[0])], dtype=np.int32)\n                    metadata = lines[1].split(\",\")\n                    try:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            r,\n           
                 theta_P,\n                            tau_P,\n                            theta_L,\n                            tau_L,\n                            tau_PL,\n                        ) = metadata[0].split(\"_\")\n                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(\n                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])\n                        )\n                    except ValueError:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            _,\n                        ) = metadata[0].split(\"_\")\n                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6\n                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)\n                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)\n                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))\n                    lines = list(map(lambda x: x.split(), lines[2:]))\n                    pos = np.array(lines)[:, 1:].astype(np.float32)\n                    elems = np.array(lines)[:, 0]\n                    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)\n                    natoms0 = n_atoms_ptr[0]\n                    natoms1 = n_atoms[0] - natoms0\n                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)\n                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)\n                    subset = np.array([root.split(\"/\")[-1]])\n\n                    item = dict(\n                        energies=energies,\n                        subset=subset,\n               
         n_atoms=n_atoms,\n                        n_atoms_ptr=n_atoms_ptr,\n                        atomic_inputs=atomic_inputs,\n                        protein_monomer_name=np.array([protein_monomer_name]),\n                        protein_interaction_site_type=np.array([protein_interaction_site_type]),\n                        ligand_monomer_name=np.array([ligand_monomer_name]),\n                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),\n                        index=np.array([index], dtype=np.float32),\n                        r=np.array([r], dtype=np.float32),\n                        theta_P=np.array([theta_P], dtype=np.float32),\n                        tau_P=np.array([tau_P], dtype=np.float32),\n                        theta_L=np.array([theta_L], dtype=np.float32),\n                        tau_L=np.array([tau_L], dtype=np.float32),\n                        tau_PL=np.array([tau_PL], dtype=np.float32),\n                        name=np.array([protein_monomer_name + \".\" + ligand_monomer_name]),\n                    )\n                    data.append(item)\n                    progress_bar.update(1)\n        logger.info(f\"Processed {i} files in total\")\n        return data\n
    "},{"location":"API/datasets/tmqm.html","title":"TMQM","text":""},{"location":"API/datasets/tmqm.html#openqdc.datasets.potential.tmqm.TMQM","title":"TMQM","text":"

    Bases: BaseDataset

    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database and then optimized in gas phase with the extended tight-binding GFN2-xTB method.

    Usage:

    from openqdc.datasets import TMQM\ndataset = TMQM()\n

    References

    https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041

    https://github.com/bbskjelstad/tmqm

    Source code in openqdc/datasets/potential/tmqm.py
    class TMQM(BaseDataset):\n    \"\"\"\n    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of\n    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated\n    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database\n    and then optimized in gas phase with the extended tight-binding GFN2-xTB method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import TMQM\n    dataset = TMQM()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\\n\n        https://github.com/bbskjelstad/tmqm\n    \"\"\"\n\n    __name__ = \"tmqm\"\n\n    __energy_methods__ = [PotentialMethod.TPSSH_DEF2_TZVP]  # \"tpssh/def2-tzvp\"]\n\n    energy_target_names = [\"TPSSh/def2TZVP level\"]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        x: f\"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}\"\n        for x in [\"tmQM_X1.xyz.gz\", \"tmQM_X2.xyz.gz\", \"tmQM_y.csv\", \"Benchmark2_TPSSh_Opt.xyz\"]\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"tmQM_y.csv\"), sep=\";\", usecols=[\"CSD_code\", \"Electronic_E\"])\n        e_map = dict(zip(df[\"CSD_code\"], df[\"Electronic_E\"]))\n        raw_fnames = [\"tmQM_X1.xyz\", \"tmQM_X2.xyz\", \"Benchmark2_TPSSh_Opt.xyz\"]\n        samples = []\n        for fname in raw_fnames:\n            data = read_xyz(p_join(self.root, fname), e_map)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/transition1x.html","title":"Transition1X","text":""},{"location":"API/datasets/transition1x.html#openqdc.datasets.potential.transition1x.Transition1X","title":"Transition1X","text":"

    Bases: BaseDataset

    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and the transition states are generated by running Nudged Elastic Band (NEB) with DFT.

    Usage:

    from openqdc.datasets import Transition1X\ndataset = Transition1X()\n

    References: - https://www.nature.com/articles/s41597-022-01870-w

    • https://gitlab.com/matschreiner/Transition1x
    Source code in openqdc/datasets/potential/transition1x.py
    class Transition1X(BaseDataset):\n    \"\"\"\n    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy\n    and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. The geometries and\n    the transition states are generated by running Nudged Elastic Band (NEB) with DFT.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Transition1X\n    dataset = Transition1X()\n    ```\n\n    References:\n    - https://www.nature.com/articles/s41597-022-01870-w\\n\n    - https://gitlab.com/matschreiner/Transition1x\\n\n    \"\"\"\n\n    __name__ = \"transition1x\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D\n        # \"wb97x/6-31G(d)\",\n    ]\n\n    energy_target_names = [\n        \"wB97x_6-31G(d).energy\",\n    ]\n\n    __force_mask__ = [True]\n    force_target_names = [\n        \"wB97x_6-31G(d).forces\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"Transition1x.h5\": \"https://figshare.com/ndownloader/files/36035789\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"Transition1x.h5\")\n        f = load_hdf5_file(raw_path)[\"data\"]\n\n        res = sum([read_record(f[g], group=g) for g in tqdm(f)], [])  # don't use parallelized here\n        return res\n
    "},{"location":"API/datasets/vqm24.html","title":"VQM24","text":""},{"location":"API/datasets/vqm24.html#openqdc.datasets.potential.vqm24.VQM24","title":"VQM24","text":"

    Bases: BaseDataset

    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical properties calculated at the wB97x-D3/cc-pVDZ level of theory. This leads to 258,242 unique constitutional isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and relaxed with the DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with the DFT method wB97x-D3/cc-pVDZ.

    Usage:

    from openqdc.datasets import VQM24\ndataset = VQM24()\n

    Reference

    https://arxiv.org/abs/2405.05961

    Source code in openqdc/datasets/potential/vqm24.py
    class VQM24(BaseDataset):\n    \"\"\"\n    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical\n    properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional\n    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and\n    relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.\n\n    Usage:\n    ```python\n    from openqdc.datasets import VQM24\n    dataset = VQM24()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2405.05961\n    \"\"\"\n\n    __name__ = \"vqm24\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_CC_PVDZ,  # \"wB97x-D3/cc-pVDZ.\"\n    ]\n\n    energy_target_names = [\n        \"wB97x-D3/cc-pVDZ\",\n    ]\n    # \u03c9B97X-D3/cc-pVDZ\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        f\"{name}.npz\": f\"https://zenodo.org/records/11164951/files/{name}.npz?download=1\"\n        for name in [\"DFT_all\", \"DFT_saddles\", \"DFT_uniques\", \"DMC\"]\n    }\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.append(read_npz_entry(raw_path))\n        return samples\n
    "},{"location":"API/datasets/waterclusters.html","title":"SCAN Waterclusters","text":""},{"location":"API/datasets/waterclusters.html#openqdc.datasets.potential.waterclusters.SCANWaterClusters","title":"SCANWaterClusters","text":"

    Bases: BaseDataset

    The SCAN Water Clusters dataset contains conformations of neutral water clusters containing up to 20 monomers, charged water clusters, and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212. Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics simulations using AMBER 9 and then optimized; the lowest-energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.

    Chemical Species

    [H, O, Li, Na, K, F, Cl, Br]

    Usage:

    from openqdc.datasets import SCANWaterClusters\ndataset = SCANWaterClusters()\n

    References

    https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec

    https://github.com/esoteric-ephemera/water_cluster_density_errors

    Source code in openqdc/datasets/potential/waterclusters.py
    class SCANWaterClusters(BaseDataset):\n    \"\"\"\n    The SCAN Water Clusters dataset contains conformations of\n    neutral water clusters containing up to 20 monomers, charged water clusters,\n    and alkali- and halide-water clusters. This dataset consists of our data sets of water clusters:\n    the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14\n    neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of\n    ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212.\n    Water clusters were obtained from  10 nanosecond gas-phase molecular dynamics\n    simulations using AMBER 9 and optimized to obtain\n    lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.\n\n\n    Chemical Species:\n        [H, O, Li, Na, K, F, Cl, Br]\n\n    Usage:\n    ```python\n    from openqdc.datasets import SCANWaterClusters\n    dataset = SCANWaterClusters()\n    ```\n\n    References:\n        https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\\n\n        https://github.com/esoteric-ephemera/water_cluster_density_errors\n    \"\"\"\n\n    __name__ = \"scanwaterclusters\"\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    energy_target_names = [\n        \"HF\",\n        \"HF-r2SCAN-DC4\",\n        \"SCAN\",\n        \"SCAN@HF\",\n        \"SCAN@r2SCAN50\",\n        \"r2SCAN\",\n        \"r2SCAN@HF\",\n        \"r2SCAN@r2SCAN50\",\n        \"r2SCAN50\",\n        \"r2SCAN100\",\n        \"r2SCAN10\",\n        \"r2SCAN20\",\n        \"r2SCAN25\",\n        \"r2SCAN30\",\n        \"r2SCAN40\",\n        \"r2SCAN60\",\n        \"r2SCAN70\",\n        \"r2SCAN80\",\n        \"r2SCAN90\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]\n    force_target_names = []\n    # 27            # 9 
level\n    subsets = [\"BEGDB_H2O\", \"WATER27\", \"H2O_alkali_clusters\", \"H2O_halide_clusters\"]\n    __links__ = {\n        \"geometries.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True\",  # noqa\n        \"total_energies.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True\",  # noqa\n    }\n\n    def read_raw_entries(self):\n        entries = []  # noqa\n        for i, subset in enumerate(self.subsets):\n            geometries = read_geometries(p_join(self.root, \"geometries.json.gz\"), subset)\n            energies = read_energies(p_join(self.root, \"total_energies.json.gz\"), subset)\n            datum = {}\n            for k in energies:\n                _ = energies[k].pop(\"metadata\")\n                datum[k] = energies[k][\"total_energies\"]\n            entries.extend(format_geometry_and_entries(geometries, datum, subset))\n        return entries\n
    "},{"location":"API/datasets/waterclusters3_30.html","title":"Waterclusters3_30","text":""},{"location":"API/datasets/waterclusters3_30.html#openqdc.datasets.potential.waterclusters3_30.WaterClusters","title":"WaterClusters","text":"

    Bases: BaseDataset

    The WaterClusters dataset contains putative minima and low energy networks for water clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. It contains approximately 4.5 mil. structures. Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.

    Chemical Species

    [\"H\", \"O\"]

    Usage:

    from openqdc.datasets import WaterClusters\ndataset = WaterClusters()\n

    References

    https://doi.org/10.1063/1.5128378

    https://sites.uw.edu/wdbase/database-of-water-clusters/

    Source code in openqdc/datasets/potential/waterclusters3_30.py
    class WaterClusters(BaseDataset):\n    \"\"\"\n    The WaterClusters dataset contains putative minima and low energy networks for water\n    clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with\n    the TTM2.1-F ab-initio based interaction potential for water.\n    It contains approximately 4.5 mil. structures.\n    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.\n\n    Chemical Species:\n        [\"H\", \"O\"]\n\n    Usage:\n    ```python\n    from openqdc.datasets import WaterClusters\n    dataset = WaterClusters()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5128378\\n\n        https://sites.uw.edu/wdbase/database-of-water-clusters/\\n\n    \"\"\"\n\n    __name__ = \"waterclusters3_30\"\n\n    # Energy in hartree, all zeros by default\n    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [PotentialMethod.TTM2_1_F]  # \"ttm2.1-f\"\n    energy_target_names = [\"TTM2.1-F Potential\"]\n    __links__ = {\"W3-W30_all_geoms_TTM2.1-F.zip\": \"https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n\"}\n\n    def read_raw_entries(self):\n        samples = []\n        parent_folder = p_join(self.root, \"W3-W30_all_geoms_TTM2.1-F/\")\n        for i in range(3, 31):\n            name = f\"W{i}_geoms_all\"\n            zip_path = p_join(parent_folder, f\"{name}.zip\")\n            xyz_path = p_join(parent_folder, f\"{name}.xyz\")\n            with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n                zip_ref.extractall(parent_folder)\n\n            data = read_xyz(xyz_path, i)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/x40.html","title":"X40","text":""},{"location":"API/datasets/x40.html#openqdc.datasets.interaction.x40.X40","title":"X40","text":"

    Bases: YamlDataset

    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules where the halogens participate in various interaction types such as electrostatic interactions, london dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are computed with CCSD(T)/CBS level of theory.

    Usage:

    from openqdc.datasets import X40\ndataset = X40()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct300647k

    Source code in openqdc/datasets/interaction/x40.py
    class X40(YamlDataset):\n    \"\"\"\n    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules\n    where the halogens participate in various interaction types such as electrostatic interactions, london\n    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic\n    molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries\n    are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are\n    computed with CCSD(T)/CBS level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import X40\n    dataset = X40()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct300647k\n    \"\"\"\n\n    __name__ = \"x40\"\n    __energy_methods__ = [\n        InteractionMethod.CCSD_T_CBS,  # \"CCSD(T)/CBS\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.DCCSDT_HA_DZ,  # \"dCCSD(T)/haDZ\",\n        InteractionMethod.DCCSDT_HA_TZ,  # \"dCCSD(T)/haTZ\",\n        InteractionMethod.MP2_5_CBS_ADZ,  # \"MP2.5/CBS(aDZ)\",\n    ]\n    __links__ = {\n        \"x40.yaml\": \"http://cuby4.molecular.cz/download_datasets/x40.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/X40.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.shortname\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        xyz_path = p_join(root, f\"{filename}.xyz\")\n        with open(xyz_path, \"r\") as xyz_file:  # avoid not closing the file\n            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))\n            setup = lines.pop(1)\n            n_atoms_first = setup[0].split(\"-\")[1]\n            n_atoms_ptr = np.array([int(n_atoms_first)], dtype=np.int32)\n            return n_atoms_ptr\n
    "},{"location":"tutorials/usage.html","title":"OpenQDC Hands-on Tutorial","text":"In\u00a0[31]: Copied!
    from openqdc.datasets import Spice\nds = Spice(\n    energy_unit=\"kcal/mol\",\n    distance_unit=\"ang\",\n)\n
    from openqdc.datasets import Spice ds = Spice( energy_unit=\"kcal/mol\", distance_unit=\"ang\", )
    2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:381 - Reading preprocessed data.\n2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:382 - Dataset spice with the following units:\n                     Energy: hartree,\n                     Distance: bohr,\n                     Forces: hartree/bohr\n2024-02-29 12:17:13.978 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded atomic_inputs with shape (33175288, 5), dtype float32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded position_idx_range with shape (1110165, 2), dtype int32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded energies with shape (1110165, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded forces with shape (33175288, 3, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded name with shape (1110165,), dtype <U632\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded subset with shape (1110165,), dtype <U20\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded n_atoms with shape (1110165,), dtype int32\n2024-02-29 12:17:13.983 | INFO     | openqdc.datasets.base:_precompute_statistics:154 - Loaded precomputed statistics\n2024-02-29 12:17:13.985 | INFO     | openqdc.datasets.base:_convert_data:141 - Converting spice data to the following units:\n                     Energy: kcal/mol,\n                     Distance: ang,\n                     Forces: kcal/mol/ang\n
    In\u00a0[39]: Copied!
    ds[0]\n
    ds[0] Out[39]:
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n        [ 0.06135919,  2.6528177 , -0.4163168 ],\n        [ 1.762424  ,  1.0939031 , -1.4321265 ],\n        [-0.22598556,  1.6802124 ,  0.5978407 ],\n        [ 1.1740401 , -0.04154727, -0.512898  ],\n        [-0.41957757, -0.24454471,  3.0900123 ],\n        [ 0.7238282 ,  0.52511275,  0.8248042 ],\n        [ 0.05533566, -0.6713925 ,  1.6488242 ],\n        [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n        [-0.0657557 ,  1.8550861 , -2.3939755 ],\n        [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n        [-0.8098082 ,  3.201651  , -0.6507186 ],\n        [ 0.792407  ,  3.368585  ,  0.01799216],\n        [ 2.558414  ,  1.5826052 , -0.9704587 ],\n        [ 2.166226  ,  0.64460325, -2.384977  ],\n        [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n        [-1.1792994 ,  1.1978384 ,  0.34465855],\n        [ 1.8563557 , -0.90775317, -0.5115611 ],\n        [ 0.31435642, -0.42179283, -1.0628686 ],\n        [ 0.42152542,  0.25200853,  3.627957  ],\n        [-0.5416419 , -1.1152233 ,  3.7040234 ],\n        [-1.1868238 ,  0.46580845,  3.0541756 ],\n        [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n        [-0.7720179 , -0.9603249 ,  0.994841  ],\n        [ 1.7518724 , -1.5571898 ,  2.560223  ],\n        [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n        [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32),\n 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n        1, 1, 1, 1, 1], dtype=int32),\n 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0, 0, 0, 0], dtype=int32),\n 'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  
-312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526 
 ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[40]: Copied!
    ds.get_ase_atoms(0)\n
    ds.get_ase_atoms(0) Out[40]:
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)
    In\u00a0[53]: Copied!
    ds.get_ase_atoms(0).info\n
    ds.get_ase_atoms(0).info Out[53]:
    {'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 
16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[41]: Copied!
    for i in ds.as_iter():\n    print(i)\n    break\n
    for i in ds.as_iter(): print(i) break
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n       [ 0.06135919,  2.6528177 , -0.4163168 ],\n       [ 1.762424  ,  1.0939031 , -1.4321265 ],\n       [-0.22598556,  1.6802124 ,  0.5978407 ],\n       [ 1.1740401 , -0.04154727, -0.512898  ],\n       [-0.41957757, -0.24454471,  3.0900123 ],\n       [ 0.7238282 ,  0.52511275,  0.8248042 ],\n       [ 0.05533566, -0.6713925 ,  1.6488242 ],\n       [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n       [-0.0657557 ,  1.8550861 , -2.3939755 ],\n       [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n       [-0.8098082 ,  3.201651  , -0.6507186 ],\n       [ 0.792407  ,  3.368585  ,  0.01799216],\n       [ 2.558414  ,  1.5826052 , -0.9704587 ],\n       [ 2.166226  ,  0.64460325, -2.384977  ],\n       [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n       [-1.1792994 ,  1.1978384 ,  0.34465855],\n       [ 1.8563557 , -0.90775317, -0.5115611 ],\n       [ 0.31435642, -0.42179283, -1.0628686 ],\n       [ 0.42152542,  0.25200853,  3.627957  ],\n       [-0.5416419 , -1.1152233 ,  3.7040234 ],\n       [-1.1868238 ,  0.46580845,  3.0541756 ],\n       [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n       [-0.7720179 , -0.9603249 ,  0.994841  ],\n       [ 1.7518724 , -1.5571898 ,  2.560223  ],\n       [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n       [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32), 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1], dtype=int32), 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0], dtype=int32), 'e0': array([[-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-33939.41501837],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       
[  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ]]), 'energies': array([-232450.64], dtype=float32), 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]', 'subset': 'PubChem', 'forces': array([[[  2.1335483 ],\n        [-37.241825  ],\n        [ 22.830988  ]],\n\n       [[ 68.235725  ],\n        [ 59.30573   ],\n        [-27.672606  ]],\n\n       [[-34.137283  ],\n        [-30.504696  ],\n        [-33.670048  ]],\n\n       [[-49.57814   ],\n        [-75.2747    ],\n        [ 32.80194   ]],\n\n       [[  8.196513  ],\n        [ 17.132149  ],\n        [-36.84995   ]],\n\n       [[ 67.39872   ],\n        [ -8.923976  ],\n        [-20.772083  ]],\n\n       [[ 45.424217  ],\n        [-33.559574  ],\n        [ 20.30243   ]],\n\n       [[-13.522426  ],\n        [ 79.690094  ],\n        [ 15.531546  ]],\n\n       [[ 35.77895   ],\n        [  1.9324436 ],\n        [ -8.205132  ]],\n\n       [[ -3.3487453 ],\n        [ -7.991125  ],\n        [ -9.71156   ]],\n\n       [[  1.4049193 ],\n        [ 13.497365  ],\n        [ -5.981079  ]],\n\n       [[-21.196207  ],\n        [ 16.861713  ],\n        [ -1.7730864 ]],\n\n       [[-10.805695  ],\n        [ -2.033095  ],\n        [ -4.2524548 ]],\n\n       [[ 35.204765  ],\n        [ 12.971134  ],\n        [ 22.815577  ]],\n\n       [[-11.87403   ],\n        [ 10.404548  ],\n        [ 23.009806  ]],\n\n       [[  2.3782759 ],\n        [ 19.309696  ],\n        [ 15.546526  ]],\n\n       [[ -2.5732849 ],\n        [ -4.098344  ],\n        [ -5.087256  ]],\n\n       [[  3.5987573 ],\n        [ 10.469024  
],\n        [  9.869113  ]],\n\n       [[ -8.646548  ],\n        [ -0.35554707],\n        [  1.7650104 ]],\n\n       [[ -6.6712875 ],\n        [ -0.7742697 ],\n        [-15.672442  ]],\n\n       [[-25.453985  ],\n        [ -9.350726  ],\n        [  6.0056353 ]],\n\n       [[-32.657543  ],\n        [ 10.617167  ],\n        [  2.516469  ]],\n\n       [[-23.541552  ],\n        [ -9.305013  ],\n        [ -9.855984  ]],\n\n       [[  2.8105662 ],\n        [-13.78966   ],\n        [ 10.141727  ]],\n\n       [[-29.951014  ],\n        [ -9.25683   ],\n        [-23.69946   ]],\n\n       [[ -3.412568  ],\n        [  4.13157   ],\n        [ 12.421117  ]],\n\n       [[  4.77353   ],\n        [-13.841051  ],\n        [  7.6428723 ]]], dtype=float32)}\n
    In\u00a0[42]: Copied!
    for i in ds.as_iter(atoms=True):\n    print(i)\n    break\n
    for i in ds.as_iter(atoms=True): print(i) break
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)\n
    In\u00a0[43]: Copied!
    from openqdc.methods import QmMethod\n\n# Get the b3lyp/6-31g* method\nmethod = QmMethod.B3LYP_6_31G_D\nmethod.atom_energies_dict\n
    from openqdc.methods import QmMethod # Get the b3lyp/6-31g* method method = QmMethod.B3LYP_6_31G_D method.atom_energies_dict Out[43]:
    {('H', -1): -0.4618190740256503,\n ('H', 0): -0.5002733301377901,\n ('H', 1): 0.0,\n ('Li', 1): -7.284546111273075,\n ('B', -3): -23.577268753399462,\n ('B', -1): -24.614577395156598,\n ('B', 0): -24.65435524492553,\n ('B', 3): -22.018169862974275,\n ('C', -1): -37.844269871879376,\n ('C', 0): -37.84628033285479,\n ('C', 1): -37.42731164237431,\n ('N', -1): -54.52864356359092,\n ('N', 0): -54.584488815424095,\n ('N', 1): -54.0458621835885,\n ('O', -1): -75.05272792994404,\n ('O', 0): -75.06062109946738,\n ('O', 1): -74.54659271939704,\n ('F', -1): -99.75408410035712,\n ('F', 0): -99.71553471526475,\n ('Na', 1): -162.081235395777,\n ('Mg', 2): -199.22734695613283,\n ('Si', 4): -285.5564410277949,\n ('Si', 0): -289.3717359984153,\n ('Si', -4): -288.02795351148654,\n ('P', 0): -341.2580911838578,\n ('P', 1): -340.8765976669208,\n ('S', -1): -398.16568433994024,\n ('S', 0): -398.1049932797066,\n ('S', 1): -397.7199808615457,\n ('Cl', -2): -459.5066184980746,\n ('Cl', -1): -460.25223446009306,\n ('Cl', 0): -460.13624346967765,\n ('Cl', 2): -458.6740467177361,\n ('K', 1): -599.7247062673807,\n ('Ca', 2): -676.8667395990246,\n ('Br', -1): -2573.824201570383,\n ('Br', 0): -2573.705283744811,\n ('I', -1): None,\n ('I', 0): None}
    In\u00a0[44]: Copied!
    # Get the matrix of atomization energies for the b3lyp/6-31g* method\nmethod.atom_energies_matrix\n
    # Get the matrix of atomization energies for the b3lyp/6-31g* method method.atom_energies_matrix Out[44]:
    array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])
    In\u00a0[45]: Copied!
    import matplotlib.pyplot as plt \nfrom sklearn.decomposition import PCA\ndatum = ds.soap_descriptors(n_samples=500, progress=True)\nreducer = PCA()\nembedding = reducer.fit_transform(datum[\"soap\"])\n
    import matplotlib.pyplot as plt from sklearn.decomposition import PCA datum = ds.soap_descriptors(n_samples=500, progress=True) reducer = PCA() embedding = reducer.fit_transform(datum[\"soap\"])
    100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 500/500 [00:01<00:00, 459.21it/s]\n
    In\u00a0[46]: Copied!
    plt.scatter(\n    embedding[:, 0],\n    embedding[:, 1],\n    c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\nplt.colorbar()\n
    plt.scatter( embedding[:, 0], embedding[:, 1], c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]]) plt.colorbar() Out[46]:
    <matplotlib.colorbar.Colorbar at 0x1554aa7bd820>
    "},{"location":"tutorials/usage.html#openqdc-hands-on-tutorial","title":"OpenQDC Hands-on Tutorial\u00b6","text":""},{"location":"tutorials/usage.html#instantiate-and-go","title":"Instantiate and GO!\u00b6","text":"

    If you don't have the dataset downloaded, it will be downloaded automatically and cached. You just instantiate the class and you are ready to go. Change of units is done automatically upon loading based on the units of the dataset.

    Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    Supported distance units: [\"ang\", \"nm\", \"bohr\"]

    "},{"location":"tutorials/usage.html#items-from-the-dataset-object-class-are-obtained-through-the-get-method","title":"Items from the dataset object class are obtained through the \"get\" method.\u00b6","text":"

    The dictionary of the item contains different important keys:

    • 'positions' : numpy array of the 3d atomic positions (n x 3)
    • 'atomic_numbers': numpy array of the atomic numbers (n)
    • 'charges': numpy array of the formal charges for the molecule (n)
    • 'e0': isolated atom energy of the atoms in the molecule (n x n_level_of_theories)
    • 'energies': potential energy of the molecule (n_level_of_theories)
    • 'name': name or SMILES (if present) of the molecule
    • 'subset': subset of the dataset the molecule belongs to
    • 'forces': if present, the forces on the atoms (n x 3 x n_level_of_theories_forces)
    "},{"location":"tutorials/usage.html#alternatively-we-can-also-retrieve-the-data-from-the-dataset-object-class-as-aseatoms-using-the-get_ase_atoms","title":"Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!\u00b6","text":""},{"location":"tutorials/usage.html#iterators","title":"Iterators\u00b6","text":"

    The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns the data as ase.Atoms objects. Otherwise, it returns the dictionary of the item.

    "},{"location":"tutorials/usage.html#isolated-atoms-energies-e0s","title":"Isolated atoms energies [e0s]\u00b6","text":"

    The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.

    $U(A_1, A_2, ...) = \\sum_{i=1}^N e_0(A_i) + e(A_1, A_2, ...)$

    The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:

    "},{"location":"tutorials/usage.html#chemical-space-from-soap-descriptors","title":"Chemical space from SOAP descriptors\u00b6","text":"

    openQDC offers a simple way to calculate the Smooth Overlap of Atomic Positions (SOAP) descriptors for the molecules in the dataset. The method get_soap_descriptors returns the SOAP descriptors for the molecules in the dataset.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"

    OpenQDC is a python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets and provide a way to standardize the data for easy use in machine learning models.

    • \ud83d\udc0d Simple pythonic API
    • \ud83d\udd79\ufe0f ML-Ready: all you manipulate are torch.Tensor, jax.Array or numpy.Array objects.
    • \u269b\ufe0f Quantum Ready: The quantum methods are checked and standardized to provide additional values.
    • \u2705 Standardized: The datasets are written in standard and performant formats with annotated metadata like units and labels.
    • \ud83e\udde0 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
    • \ud83d\udcc8 Data: have access to 1.5+ billion datapoints

    Visit our website at https://openqdc.io .

    "},{"location":"index.html#installation","title":"Installation","text":"

    Use conda:

    conda install -c conda-forge openqdc\n

    Tip: You can replace conda with mamba.

    Note: We highly recommend using a Conda Python distribution to install OpenQDC. The package is also pip installable if you need it: pip install openqdc.

    "},{"location":"index.html#quick-api-tour","title":"Quick API Tour","text":"
    from openqdc.datasets import Spice\n\n# Load the original dataset\ndataset = Spice()\n\n# Load the dataset with different units\ndataset = Spice(\n    energy_unit = \"kcal/mol\",\n    distance_unit = \"ang\",\n    energy_type = \"formation\",\n    array_format = \"torch\"\n)\n\n# Access the data\ndata = dataset[0]\n\n# Get relevant statistics\ndataset.get_statistics()\n\n# Get dataset metadata\ndataset.average_n_atoms\ndataset.chemical_species\ndataset.charges\n\n# Compute physical descriptors\ndataset.calculate_descriptors(\n    descriptor_name = \"soap\"\n)\n
    "},{"location":"index.html#how-to-cite","title":"How to cite","text":"

    Please cite OpenQDC if you use it in your research: .

    "},{"location":"index.html#compatibilities","title":"Compatibilities","text":"

    OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.

    "},{"location":"cli.html","title":"CLI for dataset downloading and uploading","text":"

    You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).

    "},{"location":"cli.html#datasets","title":"Datasets","text":"

    Print a formatted table of the available openQDC datasets and some information about them.

    Usage:

    openqdc datasets [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#cache","title":"Cache","text":"

    Get the current local cache path of openQDC

    Usage:

    openqdc cache [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#download","title":"Download","text":"

    Download preprocessed ml-ready datasets from the main openQDC hub.

    Usage:

    openqdc download DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n--as-zarr       Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]\n--gs            Which source to use for downloading. If True, Google Storage will be used. Otherwise, AWS S3 will be used. [default: no-gs]\n

    Example:

    openqdc download Spice\n
    "},{"location":"cli.html#fetch","title":"Fetch","text":"

    Download the raw datasets files from the main openQDC hub

    Note:

    Special case: if the dataset is \"all\", \"potential\", \"interaction\".\n

    Usage:

    openqdc fetch DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n

    Example:

    openqdc fetch Spice\n
    "},{"location":"cli.html#preprocess","title":"Preprocess","text":"

    Preprocess a raw dataset (previously fetched) into an openqdc dataset and optionally push it to remote.

    Usage:

    openqdc preprocess DATASETS... [OPTIONS]\n

    Options:

    --help         Show this message and exit.\n--overwrite    Whether to overwrite the current cached datasets. [default: overwrite]\n--upload       Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]\n--as-zarr      Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]\n

    Example:

    openqdc preprocess Spice QMugs\n
    "},{"location":"cli.html#upload","title":"Upload","text":"

    Upload a preprocessed dataset to the remote storage

    Usage:

    openqdc upload DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the remote files if they are present. [default: overwrite]\n--as-zarr       Whether to upload the zarr files if available. [default: no-as-zarr]\n

    Example:

    openqdc upload Spice --overwrite\n
    "},{"location":"cli.html#convert","title":"Convert","text":"

    Convert a preprocessed dataset from a memmap dataset to a zarr dataset.

    Usage:

    openqdc convert DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the current zarr cached datasets. [default: no-overwrite]\n--download      Whether to force the re-download of the memmap datasets. [default: no-download]\n
    "},{"location":"contribute.html","title":"Contribute","text":"

    The below documents the development lifecycle of OpenQDC.

    "},{"location":"contribute.html#setup-a-dev-environment","title":"Setup a dev environment","text":"
    mamba env create -n openqdc -f env.yml\nmamba activate openqdc\npip install -e .\n
    "},{"location":"contribute.html#pre-commit-installation","title":"Pre commit installation","text":"
    pre-commit install\npre-commit run --all-files\n
    "},{"location":"contribute.html#continuous-integration","title":"Continuous Integration","text":"

    OpenQDC uses Github Actions to:

    • Build and test openQDC.
      • Multiple combinations of OS and Python versions are tested.
    • Check the code:
      • Formatting with black.
      • Static type check with mypy.
      • Modules import formatting with isort.
      • Pre-commit hooks.
    • Documentation:
      • Google docstring format.
      • build and deploy the documentation on main and for every new git tag.
    "},{"location":"contribute.html#run-tests","title":"Run tests","text":"
    pytest\n
    "},{"location":"contribute.html#build-the-documentation","title":"Build the documentation","text":"

    You can build and serve the documentation locally with:

    # Build and serve the doc\nmike serve\n

    or with

    mkdocs serve\n
    "},{"location":"contribute.html#multi-versionning","title":"Multi-versionning","text":"

    The doc is built for each push on main and every git tag using mike. Everything is automated using Github Actions. Please refer to the official mike's documentation for the details.

    "},{"location":"data_storage.html","title":"Data structure","text":""},{"location":"data_storage.html#dataset-structure","title":"Dataset structure","text":"

    For a dataset with N geometries, M atoms across all geometries, ne energy labels, and nf force labels, we use zarr or memory-mapped arrays of various sizes:

    • (M, 5) for atomic numbers (1), charges (1), and positions (3) of individual geometries;
    • (N, 2) for the beginning and end indices of each geometry in the previous array;
    • (N, ne) for the energy labels of each geometry, extendable to store other geometry-level QM properties such as HOMO-LUMO gap;
    • (M, nf , 3) for the force labels of each geometry, extendable to store other atom-level QM properties.

    The memory-mapped files efficiently access data stored on disk or in the cloud without reading them into memory, enabling training on machines with smaller RAM than the dataset size and accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, batching and iteration.

    "},{"location":"data_storage.html#formats","title":"Formats","text":"

    We currently support the following formats:

    1) Zarr : https://zarr.readthedocs.io/en/stable/index.html

    2) Memmap : https://numpy.org/doc/stable/index.html

    "},{"location":"dataset_upload.html","title":"How to Add a Dataset to OpenQDC","text":"

    Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:

    1. Opening a PR to add a new dataset
    2. Request a new dataset through Google Form
    "},{"location":"dataset_upload.html#openqdc-pr-guidelines","title":"OpenQDC PR Guidelines","text":"

    Implement your dataset in the OpenQDC repository by following the guidelines below:

    "},{"location":"dataset_upload.html#dataset-class","title":"Dataset class","text":"
    • The dataset class should be implemented in the openqdc/datasets directory.
    • The dataset class should inherit from the openqdc.datasets.base.BaseDataset class.
    • Add your dataset.py file to the openqdc/datasets/potential or openqdc/datasets/interaction/ directory based on the type of energy.
    • Implement the following for your dataset:
      • Add the metadata of the dataset:
        • Docstrings for the dataset class. Docstrings should report links and references to the dataset. A small description and if possible, the sampling strategy used to generate the dataset.
        • __links__: Dictionary of name and link to download the dataset.
        • __name__: Name of the dataset. This will create a folder with the name of the dataset in the cache directory.
        • The original units for the dataset __energy_unit__ and __distance_unit__.
        • __force_mask__: Boolean indicating whether the dataset has forces, or a list of booleans if multiple force labels are present.
        • __energy_methods__: List of the QmMethod methods present in the dataset.
      • read_raw_entries(self) -> List[Dict[str, Any]]: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format, look at data storage. This data should have the following keys:
        • atomic_inputs : Atomic inputs of the molecule. numpy.Float32.
        • name: Atomic numbers of the atoms in the molecule. numpy.Object.
        • subset: Positions of the atoms in the molecule. numpy.Object.
        • energies: Energies of the molecule. numpy.Float64.
        • n_atoms: Number of atoms in the molecule. numpy.Int32
        • forces: Forces of the molecule. [Optional] numpy.Float32.
      • Add the dataset import to the openqdc/datasets/<type_of_dataset>/__init__.py file and to openqdc/__init__.py.
    "},{"location":"dataset_upload.html#test-the-dataset","title":"Test the dataset","text":"

    Try to run the openQDC CLI pipeline with the dataset you implemented.

    Run the following command to download the dataset:

    • Fetch the dataset files
      openqdc fetch DATASET_NAME\n
    • Preprocess the dataset
      openqdc preprocess DATASET_NAME\n
    • Load it in Python and check if the dataset is correctly loaded.
      from openqdc import DATASET_NAME\nds=DATASET_NAME()\n

    If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.

    • Select for your PR the dataset label.

    Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.

    "},{"location":"dataset_upload.html#openqdc-google-form","title":"OpenQDC Google Form","text":"

    Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you. You can fill out the Google Form here

    As the openQDC team will strive to provide a high quality curation and upload, please be patient as the team will need to review the dataset and carry out the necessary steps to ensure the dataset is uploaded correctly.

    "},{"location":"datasets.html","title":"Overview of Datasets","text":"

    We provide support for the following publicly available QM Datasets.

    Dataset # Molecules # Conformers Average Conformers per Molecule Force Labels Atom Types QM Level of Theory Off-Equilibrium Conformations GEOM 450,000 37,000,000 82 No 18 GFN2-xTB No Molecule3D 3,899,647 3,899,647 1 No 5 B3LYP/6-31G* No NablaDFT 1,000,000 5,000,000 5 No 6 \u03c9B97X-D/def2-SVP QMugs 665,000 2,000,000 3 No 10 GFN2-xTB, \u03c9B97X-D/def2-SVP No Spice 19,238 1,132,808 59 Yes 15 \u03c9B97M-D3(BJ)/def2-TZVPPD Yes ANI 57,462 20,000,000 348 No 4 \u03c9B97x:6-31G(d) Yes tmQM 86,665 No TPSSh-D3BJ/def2-SVP DES370K 3,700 370,000 100 No 20 CCSD(T) Yes DES5M 3,700 5,000,000 1351 No 20 SNS-MP2 Yes OrbNet Denali 212,905 2,300,000 11 No 16 GFN1-xTB Yes SN2RXN 39 452709 11,600 Yes 6 DSD-BLYP-D3(BJ)/def2-TZVP QM7X 6,950 4,195,237 603 Yes 7 PBE0+MBD Yes"},{"location":"licensing.html","title":"License","text":"
    Creative Commons Attribution-NonCommercial 4.0 International\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright and\ncertain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n-   Considerations for licensors: Our public licenses are intended for\n    use by those authorized to give the public permission to use\n    material in ways otherwise restricted by copyright and certain other\n    rights. Our licenses are irrevocable. Licensors should read and\n    understand the terms and conditions of the license they choose\n    before applying it. Licensors should also secure all rights\n    necessary before applying our licenses so that the public can reuse\n    the material as expected. Licensors should clearly mark any material\n    not subject to the license. This includes other CC-licensed\n    material, or material used under an exception or limitation to\n    copyright. 
More considerations for licensors :\n    wiki.creativecommons.org/Considerations\\_for\\_licensors\n\n-   Considerations for the public: By using one of our public licenses,\n    a licensor grants the public permission to use the licensed material\n    under specified terms and conditions. If the licensor's permission\n    is not necessary for any reason\u2013for example, because of any\n    applicable exception or limitation to copyright\u2013then that use is not\n    regulated by the license. Our licenses grant only permissions under\n    copyright and certain other rights that a licensor has authority to\n    grant. Use of the licensed material may still be restricted for\n    other reasons, including because others have copyright or other\n    rights in the material. A licensor may make special requests, such\n    as asking that all changes be marked or described. Although not\n    required by our licenses, you are encouraged to respect those\n    requests where reasonable. More considerations for the public :\n    wiki.creativecommons.org/Considerations\\_for\\_licensees\n\nCreative Commons Attribution-NonCommercial 4.0 International Public\nLicense\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial 4.0 International Public License (\"Public\nLicense\"). To the extent this Public License may be interpreted as a\ncontract, You are granted the Licensed Rights in consideration of Your\nacceptance of these terms and conditions, and the Licensor grants You\nsuch rights in consideration of benefits the Licensor receives from\nmaking the Licensed Material available under these terms and conditions.\n\n-   Section 1 \u2013 Definitions.\n\n    -   a. 
Adapted Material means material subject to Copyright and\n        Similar Rights that is derived from or based upon the Licensed\n        Material and in which the Licensed Material is translated,\n        altered, arranged, transformed, or otherwise modified in a\n        manner requiring permission under the Copyright and Similar\n        Rights held by the Licensor. For purposes of this Public\n        License, where the Licensed Material is a musical work,\n        performance, or sound recording, Adapted Material is always\n        produced where the Licensed Material is synched in timed\n        relation with a moving image.\n    -   b. Adapter's License means the license You apply to Your\n        Copyright and Similar Rights in Your contributions to Adapted\n        Material in accordance with the terms and conditions of this\n        Public License.\n    -   c. Copyright and Similar Rights means copyright and/or similar\n        rights closely related to copyright including, without\n        limitation, performance, broadcast, sound recording, and Sui\n        Generis Database Rights, without regard to how the rights are\n        labeled or categorized. For purposes of this Public License, the\n        rights specified in Section 2(b)(1)-(2) are not Copyright and\n        Similar Rights.\n    -   d. Effective Technological Measures means those measures that,\n        in the absence of proper authority, may not be circumvented\n        under laws fulfilling obligations under Article 11 of the WIPO\n        Copyright Treaty adopted on December 20, 1996, and/or similar\n        international agreements.\n    -   e. Exceptions and Limitations means fair use, fair dealing,\n        and/or any other exception or limitation to Copyright and\n        Similar Rights that applies to Your use of the Licensed\n        Material.\n    -   f. 
Licensed Material means the artistic or literary work,\n        database, or other material to which the Licensor applied this\n        Public License.\n    -   g. Licensed Rights means the rights granted to You subject to\n        the terms and conditions of this Public License, which are\n        limited to all Copyright and Similar Rights that apply to Your\n        use of the Licensed Material and that the Licensor has authority\n        to license.\n    -   h. Licensor means the individual(s) or entity(ies) granting\n        rights under this Public License.\n    -   i. NonCommercial means not primarily intended for or directed\n        towards commercial advantage or monetary compensation. For\n        purposes of this Public License, the exchange of the Licensed\n        Material for other material subject to Copyright and Similar\n        Rights by digital file-sharing or similar means is NonCommercial\n        provided there is no payment of monetary compensation in\n        connection with the exchange.\n    -   j. Share means to provide material to the public by any means or\n        process that requires permission under the Licensed Rights, such\n        as reproduction, public display, public performance,\n        distribution, dissemination, communication, or importation, and\n        to make material available to the public including in ways that\n        members of the public may access the material from a place and\n        at a time individually chosen by them.\n    -   k. Sui Generis Database Rights means rights other than copyright\n        resulting from Directive 96/9/EC of the European Parliament and\n        of the Council of 11 March 1996 on the legal protection of\n        databases, as amended and/or succeeded, as well as other\n        essentially equivalent rights anywhere in the world.\n    -   l. You means the individual or entity exercising the Licensed\n        Rights under this Public License. 
Your has a corresponding\n        meaning.\n\n-   Section 2 \u2013 Scope.\n\n    -   a. License grant.\n        -   1. Subject to the terms and conditions of this Public\n            License, the Licensor hereby grants You a worldwide,\n            royalty-free, non-sublicensable, non-exclusive, irrevocable\n            license to exercise the Licensed Rights in the Licensed\n            Material to:\n            -   A. reproduce and Share the Licensed Material, in whole\n                or in part, for NonCommercial purposes only; and\n            -   B. produce, reproduce, and Share Adapted Material for\n                NonCommercial purposes only.\n        -   2. Exceptions and Limitations. For the avoidance of doubt,\n            where Exceptions and Limitations apply to Your use, this\n            Public License does not apply, and You do not need to comply\n            with its terms and conditions.\n        -   3. Term. The term of this Public License is specified in\n            Section 6(a).\n        -   4. Media and formats; technical modifications allowed. The\n            Licensor authorizes You to exercise the Licensed Rights in\n            all media and formats whether now known or hereafter\n            created, and to make technical modifications necessary to do\n            so. The Licensor waives and/or agrees not to assert any\n            right or authority to forbid You from making technical\n            modifications necessary to exercise the Licensed Rights,\n            including technical modifications necessary to circumvent\n            Effective Technological Measures. For purposes of this\n            Public License, simply making modifications authorized by\n            this Section 2(a)(4) never produces Adapted Material.\n        -   5. Downstream recipients.\n            -   A. Offer from the Licensor \u2013 Licensed Material. 
Every\n                recipient of the Licensed Material automatically\n                receives an offer from the Licensor to exercise the\n                Licensed Rights under the terms and conditions of this\n                Public License.\n            -   B. No downstream restrictions. You may not offer or\n                impose any additional or different terms or conditions\n                on, or apply any Effective Technological Measures to,\n                the Licensed Material if doing so restricts exercise of\n                the Licensed Rights by any recipient of the Licensed\n                Material.\n        -   6. No endorsement. Nothing in this Public License\n            constitutes or may be construed as permission to assert or\n            imply that You are, or that Your use of the Licensed\n            Material is, connected with, or sponsored, endorsed, or\n            granted official status by, the Licensor or others\n            designated to receive attribution as provided in Section\n            3(a)(1)(A)(i).\n    -   b. Other rights.\n        -   1. Moral rights, such as the right of integrity, are not\n            licensed under this Public License, nor are publicity,\n            privacy, and/or other similar personality rights; however,\n            to the extent possible, the Licensor waives and/or agrees\n            not to assert any such rights held by the Licensor to the\n            limited extent necessary to allow You to exercise the\n            Licensed Rights, but not otherwise.\n        -   2. Patent and trademark rights are not licensed under this\n            Public License.\n        -   3. To the extent possible, the Licensor waives any right to\n            collect royalties from You for the exercise of the Licensed\n            Rights, whether directly or through a collecting society\n            under any voluntary or waivable statutory or compulsory\n            licensing scheme. 
In all other cases the Licensor expressly\n            reserves any right to collect such royalties, including when\n            the Licensed Material is used other than for NonCommercial\n            purposes.\n\n-   Section 3 \u2013 License Conditions.\n\n    Your exercise of the Licensed Rights is expressly made subject to\n    the following conditions.\n\n    -   a. Attribution.\n        -   1. If You Share the Licensed Material (including in modified\n            form), You must:\n            -   A. retain the following if it is supplied by the\n                Licensor with the Licensed Material:\n                -   i. identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if designated);\n                -   ii. a copyright notice;\n                -   iii. a notice that refers to this Public License;\n                -   iv. a notice that refers to the disclaimer of\n                    warranties;\n                -   v. a URI or hyperlink to the Licensed Material to\n                    the extent reasonably practicable;\n            -   B. indicate if You modified the Licensed Material and\n                retain an indication of any previous modifications; and\n            -   C. indicate the Licensed Material is licensed under this\n                Public License, and include the text of, or the URI or\n                hyperlink to, this Public License.\n        -   2. You may satisfy the conditions in Section 3(a)(1) in any\n            reasonable manner based on the medium, means, and context in\n            which You Share the Licensed Material. For example, it may\n            be reasonable to satisfy the conditions by providing a URI\n            or hyperlink to a resource that includes the required\n            information.\n        -   3. 
If requested by the Licensor, You must remove any of the\n            information required by Section 3(a)(1)(A) to the extent\n            reasonably practicable.\n        -   4. If You Share Adapted Material You produce, the Adapter's\n            License You apply must not prevent recipients of the Adapted\n            Material from complying with this Public License.\n\n-   Section 4 \u2013 Sui Generis Database Rights.\n\n    Where the Licensed Rights include Sui Generis Database Rights that\n    apply to Your use of the Licensed Material:\n\n    -   a. for the avoidance of doubt, Section 2(a)(1) grants You the\n        right to extract, reuse, reproduce, and Share all or a\n        substantial portion of the contents of the database for\n        NonCommercial purposes only;\n    -   b. if You include all or a substantial portion of the database\n        contents in a database in which You have Sui Generis Database\n        Rights, then the database in which You have Sui Generis Database\n        Rights (but not its individual contents) is Adapted Material;\n        and\n    -   c. You must comply with the conditions in Section 3(a) if You\n        Share all or a substantial portion of the contents of the\n        database.\n\n    For the avoidance of doubt, this Section 4 supplements and does not\n    replace Your obligations under this Public License where the\n    Licensed Rights include other Copyright and Similar Rights.\n\n-   Section 5 \u2013 Disclaimer of Warranties and Limitation of Liability.\n\n    -   a. Unless otherwise separately undertaken by the Licensor, to\n        the extent possible, the Licensor offers the Licensed Material\n        as-is and as-available, and makes no representations or\n        warranties of any kind concerning the Licensed Material, whether\n        express, implied, statutory, or other. 
This includes, without\n        limitation, warranties of title, merchantability, fitness for a\n        particular purpose, non-infringement, absence of latent or other\n        defects, accuracy, or the presence or absence of errors, whether\n        or not known or discoverable. Where disclaimers of warranties\n        are not allowed in full or in part, this disclaimer may not\n        apply to You.\n    -   b. To the extent possible, in no event will the Licensor be\n        liable to You on any legal theory (including, without\n        limitation, negligence) or otherwise for any direct, special,\n        indirect, incidental, consequential, punitive, exemplary, or\n        other losses, costs, expenses, or damages arising out of this\n        Public License or use of the Licensed Material, even if the\n        Licensor has been advised of the possibility of such losses,\n        costs, expenses, or damages. Where a limitation of liability is\n        not allowed in full or in part, this limitation may not apply to\n        You.\n    -   c. The disclaimer of warranties and limitation of liability\n        provided above shall be interpreted in a manner that, to the\n        extent possible, most closely approximates an absolute\n        disclaimer and waiver of all liability.\n\n-   Section 6 \u2013 Term and Termination.\n\n    -   a. This Public License applies for the term of the Copyright and\n        Similar Rights licensed here. However, if You fail to comply\n        with this Public License, then Your rights under this Public\n        License terminate automatically.\n    -   b. Where Your right to use the Licensed Material has terminated\n        under Section 6(a), it reinstates:\n\n        -   1. automatically as of the date the violation is cured,\n            provided it is cured within 30 days of Your discovery of the\n            violation; or\n        -   2. 
upon express reinstatement by the Licensor.\n\n        For the avoidance of doubt, this Section 6(b) does not affect\n        any right the Licensor may have to seek remedies for Your\n        violations of this Public License.\n\n    -   c. For the avoidance of doubt, the Licensor may also offer the\n        Licensed Material under separate terms or conditions or stop\n        distributing the Licensed Material at any time; however, doing\n        so will not terminate this Public License.\n    -   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n        License.\n\n-   Section 7 \u2013 Other Terms and Conditions.\n\n    -   a. The Licensor shall not be bound by any additional or\n        different terms or conditions communicated by You unless\n        expressly agreed.\n    -   b. Any arrangements, understandings, or agreements regarding the\n        Licensed Material not stated herein are separate from and\n        independent of the terms and conditions of this Public License.\n\n-   Section 8 \u2013 Interpretation.\n\n    -   a. For the avoidance of doubt, this Public License does not, and\n        shall not be interpreted to, reduce, limit, restrict, or impose\n        conditions on any use of the Licensed Material that could\n        lawfully be made without permission under this Public License.\n    -   b. To the extent possible, if any provision of this Public\n        License is deemed unenforceable, it shall be automatically\n        reformed to the minimum extent necessary to make it enforceable.\n        If the provision cannot be reformed, it shall be severed from\n        this Public License without affecting the enforceability of the\n        remaining terms and conditions.\n    -   c. No term or condition of this Public License will be waived\n        and no failure to comply consented to unless expressly agreed to\n        by the Licensor.\n    -   d. 
Nothing in this Public License constitutes or may be\n        interpreted as a limitation upon, or waiver of, any privileges\n        and immunities that apply to the Licensor or You, including from\n        the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. Notwithstanding,\nCreative Commons may elect to apply one of its public licenses to\nmaterial it publishes and in those instances will be considered the\n\"Licensor.\" The text of the Creative Commons public licenses is\ndedicated to the public domain under the CC0 Public Domain Dedication.\nExcept for the limited purpose of indicating that material is shared\nunder a Creative Commons public license or as otherwise permitted by the\nCreative Commons policies published at creativecommons.org/policies,\nCreative Commons does not authorize the use of the trademark \"Creative\nCommons\" or any other trademark or logo of Creative Commons without its\nprior written consent including, without limitation, in connection with\nany unauthorized modifications to any of its public licenses or any\nother arrangements, understandings, or agreements concerning use of\nlicensed material. For the avoidance of doubt, this paragraph does not\nform part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n
    "},{"location":"normalization_e0s.html","title":"Overview of QM Methods and Normalization","text":"

    OpenQDC provides support for 250+ QM Methods and provides a way to standardize and categorize the usage of different levels of theory used for Quantum Mechanics Single Point Calculations to add value and information to the datasets.

    "},{"location":"normalization_e0s.html#level-of-theory","title":"Level of Theory","text":"

    To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums consisting of a functional, a basis set, and a correction method. OpenQDC covers more than 106 functionals, 20 basis sets, and 11 correction methods. OpenQDC provides the computed isolated atom energies e0 for each QM method.

    "},{"location":"normalization_e0s.html#normalization","title":"Normalization","text":"

    We support the normalization of energies through \"physical\" and \"regression\" normalization to conserve the size extensivity of chemical systems. Through physical normalization, OpenQDC provides a way to transform the potential energy into the atomization energy by subtracting the isolated atom energies e0, a physically interpretable and extensivity-conserving normalization method. Alternatively, we pre-compute the average contribution of each atom species to the potential energy via linear or ridge regression, centering the distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.

    "},{"location":"normalization_e0s.html#physical-normalization","title":"Physical Normalization","text":"

    e0 energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from the potential energy to obtain the atomization energy. This normalization method is physically interpretable and only removes the atom energy contribution from the potential energy.

    "},{"location":"normalization_e0s.html#regression-normalization","title":"Regression Normalization","text":"

    e0 energies are calculated for each atom in the dataset by fitting a regression model to the potential energy. The e0 energies are then subtracted from the potential energy to obtain the atomization energy. This normalization provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy. The resulting formation energy is centered at 0.

    "},{"location":"usage.html","title":"Usage","text":""},{"location":"usage.html#how-to-use","title":"How to use","text":"

    OpenQDC has been designed to be used with a single import:

    import openqdc as qdc\ndataset = qdc.QM9()\n

    All openQDC functions are available under qdc. Or if you want to directly import a specific dataset:

    from openqdc import Spice\n# Spice dataset with distance unit in angstrom instead of bohr\ndataset = Spice(distance_unit=\"ang\",\n                array_format = \"jax\"\n)\ndataset[0] # dict of jax array\n

    Or if you prefer handling ase.Atoms objects:

    dataset.get_ase_atoms(0)\n
    "},{"location":"usage.html#iterators","title":"Iterators","text":"

    OpenQDC provides a simple way to get the data as iterators:

    for data in dataset.as_iter(atoms=True):\n    print(data) # Atoms object\n    break\n

    or if you want to just iterate over the data:

    for data in dataset:\n    print(data) # dict of arrays\n    break\n
    "},{"location":"usage.html#lazy-loading","title":"Lazy loading","text":"

    OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during import openqdc as qdc. In case of trouble you can always disable lazy loading by setting the environment variable OPENQDC_DISABLE_LAZY_LOADING to 1.

    "},{"location":"API/basedataset.html","title":"BaseDataset","text":"

    The BaseDataset class defines the functionality shared between all datasets.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset","title":"BaseDataset","text":"

    Bases: DatasetPropertyMixIn

    Base class for datasets in the openQDC package.

    Source code in openqdc/datasets/base.py
    class BaseDataset(DatasetPropertyMixIn):\n    \"\"\"\n    Base class for datasets in the openQDC package.\n    \"\"\"\n\n    energy_target_names = []\n    force_target_names = []\n    read_as_zarr = False\n    __energy_methods__ = []\n    __force_mask__ = []\n    __isolated_atom_energies__ = []\n    _fn_energy = lambda x: x\n    _fn_distance = lambda x: x\n    _fn_forces = lambda x: x\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __average_nb_atoms__ = None\n    __links__ = {}\n\n    def __init__(\n        self,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n        array_format: str = \"numpy\",\n        energy_type: Optional[str] = \"formation\",\n        overwrite_local_cache: bool = False,\n        cache_dir: Optional[str] = None,\n        recompute_statistics: bool = False,\n        transform: Optional[Callable] = None,\n        skip_statistics: bool = False,\n        read_as_zarr: bool = False,\n        regressor_kwargs: Dict = {\n            \"solver_type\": \"linear\",\n            \"sub_sample\": None,\n            \"stride\": 1,\n        },\n    ) -> None:\n        \"\"\"\n\n        Parameters:\n            energy_unit:\n                Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n            distance_unit:\n                Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n            array_format:\n                Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n            energy_type:\n                Type of isolated atom energy to use for the dataset. 
Default: \"formation\"\n                Supported types: [\"formation\", \"regression\", \"null\", None]\n            overwrite_local_cache:\n                Whether to overwrite the locally cached dataset.\n            cache_dir:\n                Cache directory location. Defaults to \"~/.cache/openqdc\"\n            recompute_statistics:\n                Whether to recompute the statistics of the dataset.\n            transform:\n                transformation to apply to the __getitem__ calls\n            regressor_kwargs:\n                Dictionary of keyword arguments to pass to the regressor.\n                Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n                solver_type can be one of [\"linear\", \"ridge\"]\n        \"\"\"\n        set_cache_dir(cache_dir)\n        # self._init_lambda_fn()\n        self.data = None\n        self._original_unit = self.energy_unit\n        self.recompute_statistics = recompute_statistics\n        self.regressor_kwargs = regressor_kwargs\n        self.transform = transform\n        self.read_as_zarr = read_as_zarr\n        self.energy_type = energy_type if energy_type is not None else \"null\"\n        self.refit_e0s = recompute_statistics or overwrite_local_cache\n        self.skip_statistics = skip_statistics\n        if not self.is_preprocessed():\n            raise DatasetNotAvailableError(self.__name__)\n        else:\n            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n        self.set_array_format(array_format)\n        self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n\n    def _init_lambda_fn(self):\n        self._fn_energy = lambda x: x\n        self._fn_distance = lambda x: x\n        self._fn_forces = lambda x: x\n\n    @property\n    def dataset_wrapper(self):\n        if not hasattr(self, \"_dataset_wrapper\"):\n            self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()\n        return 
self._dataset_wrapper\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=self.__name__, links=self.__links__)\n\n    @classmethod\n    def fetch(cls, cache_path: Optional[str] = None, overwrite: bool = False) -> None:\n        from openqdc.utils.download_api import DataDownloader\n\n        DataDownloader(cache_path, overwrite).from_config(cls.no_init().config)\n\n    def _post_init(\n        self,\n        overwrite_local_cache: bool = False,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n    ) -> None:\n        self._set_units(None, None)\n        self._set_isolated_atom_energies()\n        if not self.skip_statistics:\n            self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)\n        self._set_units(energy_unit, distance_unit)\n        self._convert_data()\n        self._set_isolated_atom_energies()\n\n    def _precompute_statistics(self, overwrite_local_cache: bool = False):\n        # if self.recompute_statistics or overwrite_local_cache:\n        self.statistics = StatisticManager(\n            self,\n            self.recompute_statistics or overwrite_local_cache,  # check if we need to recompute\n            # Add the common statistics (Forces, TotalE, FormE, PerAtomE)\n            ForcesCalculatorStats,\n            TotalEnergyStats,\n            FormationEnergyStats,\n            PerAtomFormationEnergyStats,\n        )\n        self.statistics.run_calculators()  # run the calculators\n        self._compute_average_nb_atoms()\n\n    @classmethod\n    def no_init(cls):\n        \"\"\"\n        Class method to avoid the __init__ method to be called when the class is instanciated.\n        Useful for debugging purposes or preprocessing data.\n        \"\"\"\n        return cls.__new__(cls)\n\n    @property\n    def __force_methods__(self):\n        \"\"\"\n        For backward compatibility. 
To be removed in the future.\n        \"\"\"\n        return self.force_methods\n\n    @property\n    def energy_methods(self) -> List[str]:\n        \"\"\"Return the string version of the energy methods\"\"\"\n        return [str(i) for i in self.__energy_methods__]\n\n    @property\n    def force_mask(self):\n        if len(self.__class__.__force_mask__) == 0:\n            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)\n        return self.__class__.__force_mask__\n\n    @property\n    def force_methods(self):\n        return list(compress(self.energy_methods, self.force_mask))\n\n    @property\n    def e0s_dispatcher(self) -> AtomEnergies:\n        \"\"\"\n        Property to get the object that dispatched the isolated atom energies of the QM methods.\n\n        Returns:\n            Object wrapping the isolated atom energies of the QM methods.\n        \"\"\"\n        if not hasattr(self, \"_e0s_dispatcher\"):\n            # Automatically fetch/compute formation or regression energies\n            self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)\n        return self._e0s_dispatcher\n\n    def _convert_data(self):\n        logger.info(\n            f\"Converting {self.__name__} data to the following units:\\n\\\n                     Energy: {str(self.energy_unit)},\\n\\\n                     Distance: {str(self.distance_unit)},\\n\\\n                     Forces: {str(self.force_unit) if self.__force_methods__ else 'None'}\"\n        )\n        for key in self.data_keys:\n            self.data[key] = self._convert_on_loading(self.data[key], key)\n\n    @property\n    def energy_unit(self):\n        return EnergyTypeConversion(self.__energy_unit__)\n\n    @property\n    def distance_unit(self):\n        return DistanceTypeConversion(self.__distance_unit__)\n\n    @property\n    def force_unit(self):\n        units = self.__forces_unit__.split(\"/\")\n        if len(units) > 2:\n            units = [\"/\".join(units[:2]), 
units[-1]]\n        return ForceTypeConversion(tuple(units))  # < 3.12 compatibility\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), self.__name__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def data_keys(self):\n        keys = list(self.data_types.keys())\n        if len(self.__force_methods__) == 0:\n            keys.remove(\"forces\")\n        return keys\n\n    @property\n    def pkl_data_keys(self):\n        return list(self.pkl_data_types.keys())\n\n    @property\n    def pkl_data_types(self):\n        return {\"name\": str, \"subset\": str, \"n_atoms\": np.int32}\n\n    @property\n    def atom_energies(self):\n        return self._e0s_dispatcher\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float64,\n            \"forces\": np.float32,\n        }\n\n    @property\n    def data_shapes(self):\n        return {\n            \"atomic_inputs\": (-1, NB_ATOMIC_FEATURES),\n            \"position_idx_range\": (-1, 2),\n            \"energies\": (-1, len(self.energy_methods)),\n            \"forces\": (-1, 3, len(self.force_methods)),\n        }\n\n    def _set_units(self, en: Optional[str] = None, ds: Optional[str] = None):\n        old_en, old_ds = self.energy_unit, self.distance_unit\n        en = en if en is not None else old_en\n        ds = ds if ds is not None else old_ds\n        self.set_energy_unit(en)\n        self.set_distance_unit(ds)\n        if self.__force_methods__:\n            self._fn_forces = self.force_unit.to(str(self.energy_unit), str(self.distance_unit))\n            self.__forces_unit__ = str(self.energy_unit) + \"/\" + str(self.distance_unit)\n\n    def _set_isolated_atom_energies(self):\n        if self.__energy_methods__ is None:\n  
          logger.error(\"No energy methods defined for this dataset.\")\n        if self.energy_type == \"formation\":\n            f = get_conversion(\"hartree\", self.__energy_unit__)\n        else:\n            # regression are calculated on the original unit of the dataset\n            f = self._original_unit.to(self.energy_unit)\n        self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix)\n\n    def convert_energy(self, x):\n        return self._fn_energy(x)\n\n    def convert_distance(self, x):\n        return self._fn_distance(x)\n\n    def convert_forces(self, x):\n        return self._fn_forces(x)\n\n    def set_energy_unit(self, value: str):\n        \"\"\"\n        Set a new energy unit for the dataset.\n\n        Parameters:\n            value:\n                New energy unit to set.\n        \"\"\"\n        # old_unit = self.energy_unit\n        # self.__energy_unit__ = value\n        self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n        self.__energy_unit__ = value\n\n    def set_distance_unit(self, value: str):\n        \"\"\"\n        Set a new distance unit for the dataset.\n\n        Parameters:\n            value:\n                New distance unit to set.\n        \"\"\"\n        # old_unit = self.distance_unit\n        # self.__distance_unit__ = value\n        self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n        self.__distance_unit__ = value\n\n    def set_array_format(self, format: str):\n        assert format in [\"numpy\", \"torch\", \"jax\"], f\"Format {format} not supported.\"\n        self.array_format = format\n\n    def read_raw_entries(self):\n        \"\"\"\n        Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n        \"\"\"\n        raise NotImplementedError\n\n    def collate_list(self, list_entries: List[Dict]) -> Dict:\n        \"\"\"\n        Collate a list of entries into a single dictionary.\n\n        
Parameters:\n            list_entries:\n                List of dictionaries containing the entries to collate.\n\n        Returns:\n            Dictionary containing the collated entries.\n        \"\"\"\n        # concatenate entries\n        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n        csum = np.cumsum(res.get(\"n_atoms\"))\n        x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n        x[1:, 0], x[:, 1] = csum[:-1], csum\n        res[\"position_idx_range\"] = x\n\n        return res\n\n    def save_preprocess(\n        self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n    ):\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n        Parameters:\n            data_dict:\n                Dictionary containing the preprocessed data.\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                Whether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. 
Cache is always overwritten locally.\n        \"\"\"\n        # save memmaps\n        logger.info(\"Preprocessing data and saving it to cache.\")\n        paths = self.dataset_wrapper.save_preprocess(\n            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n        )\n        if upload:\n            for local_path in paths:\n                push_remote(local_path, overwrite=overwrite)  # make it async?\n\n    def read_preprocess(self, overwrite_local_cache=False):\n        logger.info(\"Reading preprocessed data.\")\n        logger.info(\n            f\"Dataset {self.__name__} with the following units:\\n\\\n                     Energy: {self.energy_unit},\\n\\\n                     Distance: {self.distance_unit},\\n\\\n                     Forces: {self.force_unit if self.force_methods else 'None'}\"\n        )\n\n        self.data = self.dataset_wrapper.load_data(\n            self.preprocess_path,\n            self.data_keys,\n            self.data_types,\n            self.data_shapes,\n            self.pkl_data_keys,\n            overwrite_local_cache,\n        )  # this should be async if possible\n        for key in self.data:\n            logger.info(f\"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}\")\n\n    def _convert_on_loading(self, x, key):\n        if key == \"energies\":\n            return self.convert_energy(x)\n        elif key == \"forces\":\n            return self.convert_forces(x)\n        elif key == \"atomic_inputs\":\n            x = np.array(x, dtype=np.float32)\n            x[:, -3:] = self.convert_distance(x[:, -3:])\n            return x\n        else:\n            return x\n\n    def is_preprocessed(self) -> bool:\n        \"\"\"\n        Check if the dataset is preprocessed and available online or locally.\n\n        Returns:\n            True if the dataset is available remotely or locally, False otherwise.\n        \"\"\"\n        predicats = [\n            
copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def is_cached(self) -> bool:\n        \"\"\"\n        Check if the dataset is cached locally.\n\n        Returns:\n            True if the dataset is cached locally, False otherwise.\n        \"\"\"\n        predicats = [\n            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n        \"\"\"\n        Preprocess the dataset and save it.\n\n        Parameters:\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                hether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n            as_zarr:\n                Whether to save the data as zarr files\n        \"\"\"\n        if overwrite or not self.is_preprocessed():\n            entries = self.read_raw_entries()\n            res = self.collate_list(entries)\n            self.save_preprocess(res, upload, overwrite, as_zarr)\n\n    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n        \"\"\"\n        Upload the preprocessed data to the remote storage. 
Must be called after preprocess and\n        need to have write privileges.\n\n        Parameters:\n            overwrite:\n                Whether to overwrite the remote data if it already exists\n            as_zarr:\n                Whether to upload the data as zarr files\n        \"\"\"\n        for key in self.data_keys:\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n            push_remote(local_path, overwrite=overwrite)\n        local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n        push_remote(local_path, overwrite=overwrite)\n\n    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n        \"\"\"\n        Save a single entry at index idx as an extxyz file.\n\n        Parameters:\n            idx:\n                Index of the entry\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file. 
If None, the current working directory is used.\n            ext:\n                Whether to include additional informations like forces and other metadatas (extxyz format)\n        \"\"\"\n        if path is None:\n            path = os.getcwd()\n        at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n        write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n\n    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n        \"\"\"\n        Save dataset as single xyz file (extended xyz format).\n\n        Parameters:\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file\n        \"\"\"\n        with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n            for atoms in tqdm(\n                self.as_iter(atoms=True, energy_method=energy_method),\n                total=len(self),\n                desc=f\"Saving {self.__name__} as xyz file\",\n            ):\n                write_extxyz(f, atoms, append=True)\n\n    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n        \"\"\"\n        Get the ASE atoms object for the entry at index idx.\n\n        Parameters:\n            idx:\n                Index of the entry.\n            energy_method:\n                Index of the energy method to use\n            ext:\n                Whether to include additional informations\n\n        Returns:\n            ASE atoms object\n        \"\"\"\n        entry = self[idx]\n        at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n        return at\n\n    def subsample(\n        self, n_samples: Optional[Union[List[int], int, float]] = None, replace: bool = False, seed: int = 42\n    ):\n        np.random.seed(seed)\n        if n_samples is None:\n            return list(range(len(self)))\n        try:\n            if 0 < n_samples < 1:\n           
     n_samples = int(n_samples * len(self))\n            if isinstance(n_samples, int):\n                idxs = np.random.choice(len(self), size=n_samples, replace=replace)\n        except (ValueError, TypeError):  # list, set, np.ndarray\n            idxs = n_samples\n        return idxs\n\n    @requires_package(\"datamol\")\n    def calculate_descriptors(\n        self,\n        descriptor_name: str = \"soap\",\n        chemical_species: Optional[List[str]] = None,\n        n_samples: Optional[Union[List[int], int, float]] = None,\n        progress: bool = True,\n        **descriptor_kwargs,\n    ) -> Dict[str, np.ndarray]:\n        \"\"\"\n        Compute the descriptors for the dataset.\n\n        Parameters:\n            descriptor_name:\n                Name of the descriptor to use. Supported descriptors are [\"soap\"]\n            chemical_species:\n                List of chemical species to use for the descriptor computation, by default None.\n                If None, the chemical species of the dataset are used.\n            n_samples:\n                Number of samples to use for the computation, by default None.\n                If None, all the dataset is used.\n                If a list of integers is provided, the descriptors are computed for\n                each of the specified idx of samples.\n            progress:\n                Whether to show a progress bar, by default True.\n            **descriptor_kwargs : dict\n                Keyword arguments to pass to the descriptor instantiation of the model.\n\n        Returns:\n            Dictionary containing the following keys:\n                - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n                - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n        \"\"\"\n        import datamol as dm\n\n        datum = {}\n        idxs = self.subsample(n_samples)\n        model = get_descriptor(descriptor_name.lower())(\n          
  species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n        )\n\n        def wrapper(idx):\n            entry = self.get_ase_atoms(idx, ext=False)\n            return model.calculate(entry)\n\n        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n        datum[\"values\"] = np.vstack(descr)\n        datum[\"idxs\"] = idxs\n        return datum\n\n    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n        \"\"\"\n        Return the dataset as an iterator.\n\n        Parameters:\n            atoms:\n                Whether to return the items as ASE atoms object, by default False\n            energy_method:\n                Index of the energy method to use\n\n        Returns:\n            Iterator of the dataset\n        \"\"\"\n\n        func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n        for i in range(len(self)):\n            yield func(i)\n\n    def __iter__(self):\n        for idxs in range(len(self)):\n            yield self[idxs]\n\n    def get_statistics(self, return_none: bool = True) -> Dict:\n        \"\"\"\n        Get the converted statistics of the dataset.\n\n        Parameters:\n            return_none :\n                Whether to return None if the statistics for the forces are not available, by default True\n                Otherwise, the statistics for the forces are set to 0.0\n\n        Returns:\n            Dictionary containing the statistics of the dataset\n        \"\"\"\n        selected_stats = self.statistics.get_results()\n        if len(selected_stats) == 0:\n            raise StatisticsNotAvailableError(self.__name__)\n        if not return_none:\n            selected_stats.update(\n                {\n                    \"ForcesCalculatorStats\": {\n                        \"mean\": np.array([0.0]),\n                        \"std\": np.array([0.0]),\n    
                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                    }\n                }\n            )\n        # cycle trough dict to convert units\n        for key, result in selected_stats.items():\n            if isinstance(result, ForcesCalculatorStats):\n                result.transform(self.convert_forces)\n            else:\n                result.transform(self.convert_energy)\n            result.transform(self._convert_array)\n        return {k: result.to_dict() for k, result in selected_stats.items()}\n\n    def __str__(self):\n        return f\"{self.__name__}\"\n\n    def __repr__(self):\n        return f\"{self.__name__}\"\n\n    def __len__(self):\n        return self.data[\"energies\"].shape[0]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n\n    def _convert_array(self, x: np.ndarray):\n        return _CONVERT_DICT.get(self.array_format)(x)\n\n    def __getitem__(self, idx: int):\n        shift = MAX_CHARGE\n        p_start, p_end = self.data[\"position_idx_range\"][idx]\n        input = self.data[\"atomic_inputs\"][p_start:p_end]\n        z, c, positions, energies = (\n            self._convert_array(np.array(input[:, 0], dtype=np.int32)),\n            self._convert_array(np.array(input[:, 1], dtype=np.int32)),\n            self._convert_array(np.array(input[:, -3:], dtype=np.float32)),\n            self._convert_array(np.array(self.data[\"energies\"][idx], dtype=np.float64)),\n        )\n        name = self.__smiles_converter__(self.data[\"name\"][idx])\n        subset = self.data[\"subset\"][idx]\n        e0s = self._convert_array(self.__isolated_atom_energies__[..., z, c + shift].T)\n        
formation_energies = energies - e0s.sum(axis=0)\n        forces = None\n        if \"forces\" in self.data:\n            forces = self._convert_array(np.array(self.data[\"forces\"][p_start:p_end], dtype=np.float32))\n\n        bunch = Bunch(\n            positions=positions,\n            atomic_numbers=z,\n            charges=c,\n            e0=e0s,\n            energies=energies,\n            formation_energies=formation_energies,\n            per_atom_formation_energies=formation_energies / len(z),\n            name=name,\n            subset=subset,\n            forces=forces,\n        )\n\n        if self.transform is not None:\n            bunch = self.transform(bunch)\n\n        return bunch\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__force_methods__","title":"__force_methods__ property","text":"

    For backward compatibility. To be removed in the future.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.e0s_dispatcher","title":"e0s_dispatcher: AtomEnergies property","text":"

    Property to get the object that dispatched the isolated atom energies of the QM methods.

    Returns:

    Type Description AtomEnergies

    Object wrapping the isolated atom energies of the QM methods.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.energy_methods","title":"energy_methods: List[str] property","text":"

    Return the string version of the energy methods

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__init__","title":"__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 'linear', 'sub_sample': None, 'stride': 1})","text":"

    Parameters:

    Name Type Description Default energy_unit Optional[str]

    Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    None distance_unit Optional[str]

    Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]

    None array_format str

    Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]

    'numpy' energy_type Optional[str]

    Type of isolated atom energy to use for the dataset. Default: \"formation\" Supported types: [\"formation\", \"regression\", \"null\", None]

    'formation' overwrite_local_cache bool

    Whether to overwrite the locally cached dataset.

    False cache_dir Optional[str]

    Cache directory location. Defaults to \"~/.cache/openqdc\"

    None recompute_statistics bool

    Whether to recompute the statistics of the dataset.

    False transform Optional[Callable]

    Transformation to apply to the __getitem__ calls

    None regressor_kwargs Dict

    Dictionary of keyword arguments to pass to the regressor. Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1} solver_type can be one of [\"linear\", \"ridge\"]

    {'solver_type': 'linear', 'sub_sample': None, 'stride': 1} Source code in openqdc/datasets/base.py
    def __init__(\n    self,\n    energy_unit: Optional[str] = None,\n    distance_unit: Optional[str] = None,\n    array_format: str = \"numpy\",\n    energy_type: Optional[str] = \"formation\",\n    overwrite_local_cache: bool = False,\n    cache_dir: Optional[str] = None,\n    recompute_statistics: bool = False,\n    transform: Optional[Callable] = None,\n    skip_statistics: bool = False,\n    read_as_zarr: bool = False,\n    regressor_kwargs: Dict = {\n        \"solver_type\": \"linear\",\n        \"sub_sample\": None,\n        \"stride\": 1,\n    },\n) -> None:\n    \"\"\"\n\n    Parameters:\n        energy_unit:\n            Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n        distance_unit:\n            Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n        array_format:\n            Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n        energy_type:\n            Type of isolated atom energy to use for the dataset. Default: \"formation\"\n            Supported types: [\"formation\", \"regression\", \"null\", None]\n        overwrite_local_cache:\n            Whether to overwrite the locally cached dataset.\n        cache_dir:\n            Cache directory location. 
Defaults to \"~/.cache/openqdc\"\n        recompute_statistics:\n            Whether to recompute the statistics of the dataset.\n        transform:\n            transformation to apply to the __getitem__ calls\n        regressor_kwargs:\n            Dictionary of keyword arguments to pass to the regressor.\n            Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n            solver_type can be one of [\"linear\", \"ridge\"]\n    \"\"\"\n    set_cache_dir(cache_dir)\n    # self._init_lambda_fn()\n    self.data = None\n    self._original_unit = self.energy_unit\n    self.recompute_statistics = recompute_statistics\n    self.regressor_kwargs = regressor_kwargs\n    self.transform = transform\n    self.read_as_zarr = read_as_zarr\n    self.energy_type = energy_type if energy_type is not None else \"null\"\n    self.refit_e0s = recompute_statistics or overwrite_local_cache\n    self.skip_statistics = skip_statistics\n    if not self.is_preprocessed():\n        raise DatasetNotAvailableError(self.__name__)\n    else:\n        self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n    self.set_array_format(array_format)\n    self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/base.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.as_iter","title":"as_iter(atoms=False, energy_method=0)","text":"

    Return the dataset as an iterator.

    Parameters:

    Name Type Description Default atoms bool

    Whether to return the items as ASE atoms object, by default False

    False energy_method int

    Index of the energy method to use

    0

    Returns:

    Type Description Iterable

    Iterator of the dataset

    Source code in openqdc/datasets/base.py
    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n    \"\"\"\n    Return the dataset as an iterator.\n\n    Parameters:\n        atoms:\n            Whether to return the items as ASE atoms object, by default False\n        energy_method:\n            Index of the energy method to use\n\n    Returns:\n        Iterator of the dataset\n    \"\"\"\n\n    func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n    for i in range(len(self)):\n        yield func(i)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.calculate_descriptors","title":"calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)","text":"

    Compute the descriptors for the dataset.

    Parameters:

    Name Type Description Default descriptor_name str

    Name of the descriptor to use. Supported descriptors are [\"soap\"]

    'soap' chemical_species Optional[List[str]]

    List of chemical species to use for the descriptor computation, by default None. If None, the chemical species of the dataset are used.

    None n_samples Optional[Union[List[int], int, float]]

    Number of samples to use for the computation, by default None. If None, all the dataset is used. If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.

    None progress bool

    Whether to show a progress bar, by default True.

    True **descriptor_kwargs

    dict Keyword arguments to pass to the descriptor instantiation of the model.

    {}

    Returns:

    Type Description Dict[str, ndarray]

    Dictionary containing the following keys: - values : np.ndarray of shape (N, M) containing the descriptors for the dataset - idxs : np.ndarray of shape (N,) containing the indices of the samples used

    Source code in openqdc/datasets/base.py
    @requires_package(\"datamol\")\ndef calculate_descriptors(\n    self,\n    descriptor_name: str = \"soap\",\n    chemical_species: Optional[List[str]] = None,\n    n_samples: Optional[Union[List[int], int, float]] = None,\n    progress: bool = True,\n    **descriptor_kwargs,\n) -> Dict[str, np.ndarray]:\n    \"\"\"\n    Compute the descriptors for the dataset.\n\n    Parameters:\n        descriptor_name:\n            Name of the descriptor to use. Supported descriptors are [\"soap\"]\n        chemical_species:\n            List of chemical species to use for the descriptor computation, by default None.\n            If None, the chemical species of the dataset are used.\n        n_samples:\n            Number of samples to use for the computation, by default None.\n            If None, all the dataset is used.\n            If a list of integers is provided, the descriptors are computed for\n            each of the specified idx of samples.\n        progress:\n            Whether to show a progress bar, by default True.\n        **descriptor_kwargs : dict\n            Keyword arguments to pass to the descriptor instantiation of the model.\n\n    Returns:\n        Dictionary containing the following keys:\n            - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n            - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n    \"\"\"\n    import datamol as dm\n\n    datum = {}\n    idxs = self.subsample(n_samples)\n    model = get_descriptor(descriptor_name.lower())(\n        species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n    )\n\n    def wrapper(idx):\n        entry = self.get_ase_atoms(idx, ext=False)\n        return model.calculate(entry)\n\n    descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n    datum[\"values\"] = np.vstack(descr)\n    datum[\"idxs\"] = idxs\n    return datum\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.collate_list","title":"collate_list(list_entries)","text":"

    Collate a list of entries into a single dictionary.

    Parameters:

    Name Type Description Default list_entries List[Dict]

    List of dictionaries containing the entries to collate.

    required

    Returns:

    Type Description Dict

    Dictionary containing the collated entries.

    Source code in openqdc/datasets/base.py
    def collate_list(self, list_entries: List[Dict]) -> Dict:\n    \"\"\"\n    Collate a list of entries into a single dictionary.\n\n    Parameters:\n        list_entries:\n            List of dictionaries containing the entries to collate.\n\n    Returns:\n        Dictionary containing the collated entries.\n    \"\"\"\n    # concatenate entries\n    res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n    csum = np.cumsum(res.get(\"n_atoms\"))\n    x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n    x[1:, 0], x[:, 1] = csum[:-1], csum\n    res[\"position_idx_range\"] = x\n\n    return res\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_ase_atoms","title":"get_ase_atoms(idx, energy_method=0, ext=True)","text":"

    Get the ASE atoms object for the entry at index idx.

    Parameters:

    Name Type Description Default idx int

    Index of the entry.

    required energy_method int

    Index of the energy method to use

    0 ext bool

    Whether to include additional information

    True

    Returns:

    Type Description Atoms

    ASE atoms object

    Source code in openqdc/datasets/base.py
    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n    \"\"\"\n    Get the ASE atoms object for the entry at index idx.\n\n    Parameters:\n        idx:\n            Index of the entry.\n        energy_method:\n            Index of the energy method to use\n        ext:\n            Whether to include additional informations\n\n    Returns:\n        ASE atoms object\n    \"\"\"\n    entry = self[idx]\n    at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n    return at\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_statistics","title":"get_statistics(return_none=True)","text":"

    Get the converted statistics of the dataset.

    Parameters:

    Name Type Description Default return_none

    Whether to return None if the statistics for the forces are not available, by default True. Otherwise, the statistics for the forces are set to 0.0

    True

    Returns:

    Type Description Dict

    Dictionary containing the statistics of the dataset

    Source code in openqdc/datasets/base.py
    def get_statistics(self, return_none: bool = True) -> Dict:\n    \"\"\"\n    Get the converted statistics of the dataset.\n\n    Parameters:\n        return_none :\n            Whether to return None if the statistics for the forces are not available, by default True\n            Otherwise, the statistics for the forces are set to 0.0\n\n    Returns:\n        Dictionary containing the statistics of the dataset\n    \"\"\"\n    selected_stats = self.statistics.get_results()\n    if len(selected_stats) == 0:\n        raise StatisticsNotAvailableError(self.__name__)\n    if not return_none:\n        selected_stats.update(\n            {\n                \"ForcesCalculatorStats\": {\n                    \"mean\": np.array([0.0]),\n                    \"std\": np.array([0.0]),\n                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                }\n            }\n        )\n    # cycle trough dict to convert units\n    for key, result in selected_stats.items():\n        if isinstance(result, ForcesCalculatorStats):\n            result.transform(self.convert_forces)\n        else:\n            result.transform(self.convert_energy)\n        result.transform(self._convert_array)\n    return {k: result.to_dict() for k, result in selected_stats.items()}\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_cached","title":"is_cached()","text":"

    Check if the dataset is cached locally.

    Returns:

    Type Description bool

    True if the dataset is cached locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_cached(self) -> bool:\n    \"\"\"\n    Check if the dataset is cached locally.\n\n    Returns:\n        True if the dataset is cached locally, False otherwise.\n    \"\"\"\n    predicats = [\n        os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_preprocessed","title":"is_preprocessed()","text":"

    Check if the dataset is preprocessed and available online or locally.

    Returns:

    Type Description bool

    True if the dataset is available remotely or locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_preprocessed(self) -> bool:\n    \"\"\"\n    Check if the dataset is preprocessed and available online or locally.\n\n    Returns:\n        True if the dataset is available remotely or locally, False otherwise.\n    \"\"\"\n    predicats = [\n        copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.no_init","title":"no_init() classmethod","text":"

    Class method to avoid the __init__ method being called when the class is instantiated. Useful for debugging purposes or preprocessing data.

    Source code in openqdc/datasets/base.py
    @classmethod\ndef no_init(cls):\n    \"\"\"\n    Class method to avoid the __init__ method to be called when the class is instanciated.\n    Useful for debugging purposes or preprocessing data.\n    \"\"\"\n    return cls.__new__(cls)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.preprocess","title":"preprocess(upload=False, overwrite=True, as_zarr=True)","text":"

    Preprocess the dataset and save it.

    Parameters:

    Name Type Description Default upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True as_zarr bool

    Whether to save the data as zarr files

    True Source code in openqdc/datasets/base.py
    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n    \"\"\"\n    Preprocess the dataset and save it.\n\n    Parameters:\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            hether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n        as_zarr:\n            Whether to save the data as zarr files\n    \"\"\"\n    if overwrite or not self.is_preprocessed():\n        entries = self.read_raw_entries()\n        res = self.collate_list(entries)\n        self.save_preprocess(res, upload, overwrite, as_zarr)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.read_raw_entries","title":"read_raw_entries()","text":"

    Preprocess the raw data (i.e. as fetched from the source) into a list of dictionaries.

    Source code in openqdc/datasets/base.py
    def read_raw_entries(self):\n    \"\"\"\n    Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_preprocess","title":"save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.

    Parameters:

    Name Type Description Default data_dict Dict[str, ndarray]

    Dictionary containing the preprocessed data.

    required upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True Source code in openqdc/datasets/base.py
    def save_preprocess(\n    self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n):\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n    Parameters:\n        data_dict:\n            Dictionary containing the preprocessed data.\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n    \"\"\"\n    # save memmaps\n    logger.info(\"Preprocessing data and saving it to cache.\")\n    paths = self.dataset_wrapper.save_preprocess(\n        self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n    )\n    if upload:\n        for local_path in paths:\n            push_remote(local_path, overwrite=overwrite)  # make it async?\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_xyz","title":"save_xyz(idx, energy_method=0, path=None, ext=True)","text":"

    Save a single entry at index idx as an extxyz file.

    Parameters:

    Name Type Description Default idx int

    Index of the entry

    required energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file. If None, the current working directory is used.

    None ext bool

    Whether to include additional information like forces and other metadata (extxyz format)

    True Source code in openqdc/datasets/base.py
    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n    \"\"\"\n    Save a single entry at index idx as an extxyz file.\n\n    Parameters:\n        idx:\n            Index of the entry\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file. If None, the current working directory is used.\n        ext:\n            Whether to include additional informations like forces and other metadatas (extxyz format)\n    \"\"\"\n    if path is None:\n        path = os.getcwd()\n    at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n    write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_distance_unit","title":"set_distance_unit(value)","text":"

    Set a new distance unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New distance unit to set.

    required Source code in openqdc/datasets/base.py
    def set_distance_unit(self, value: str):\n    \"\"\"\n    Set a new distance unit for the dataset.\n\n    Parameters:\n        value:\n            New distance unit to set.\n    \"\"\"\n    # old_unit = self.distance_unit\n    # self.__distance_unit__ = value\n    self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n    self.__distance_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_energy_unit","title":"set_energy_unit(value)","text":"

    Set a new energy unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New energy unit to set.

    required Source code in openqdc/datasets/base.py
    def set_energy_unit(self, value: str):\n    \"\"\"\n    Set a new energy unit for the dataset.\n\n    Parameters:\n        value:\n            New energy unit to set.\n    \"\"\"\n    # old_unit = self.energy_unit\n    # self.__energy_unit__ = value\n    self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n    self.__energy_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.to_xyz","title":"to_xyz(energy_method=0, path=None)","text":"

    Save dataset as single xyz file (extended xyz format).

    Parameters:

    Name Type Description Default energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file

    None Source code in openqdc/datasets/base.py
    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n    \"\"\"\n    Save dataset as single xyz file (extended xyz format).\n\n    Parameters:\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file\n    \"\"\"\n    with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n        for atoms in tqdm(\n            self.as_iter(atoms=True, energy_method=energy_method),\n            total=len(self),\n            desc=f\"Saving {self.__name__} as xyz file\",\n        ):\n            write_extxyz(f, atoms, append=True)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.upload","title":"upload(overwrite=False, as_zarr=False)","text":"

    Upload the preprocessed data to the remote storage. Must be called after preprocess and requires write privileges.

    Parameters:

    Name Type Description Default overwrite bool

    Whether to overwrite the remote data if it already exists

    False as_zarr bool

    Whether to upload the data as zarr files

    False Source code in openqdc/datasets/base.py
    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n    \"\"\"\n    Upload the preprocessed data to the remote storage. Must be called after preprocess and\n    need to have write privileges.\n\n    Parameters:\n        overwrite:\n            Whether to overwrite the remote data if it already exists\n        as_zarr:\n            Whether to upload the data as zarr files\n    \"\"\"\n    for key in self.data_keys:\n        local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n        push_remote(local_path, overwrite=overwrite)\n    local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n    push_remote(local_path, overwrite=overwrite)\n
    "},{"location":"API/e0_dispatcher.html","title":"e0 Dispatcher","text":""},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies","title":"AtomEnergies","text":"

    Manager class that interfaces with the isolated atom energy classes and provides the general functions to retrieve the data

    Source code in openqdc/datasets/energies.py
    class AtomEnergies:\n    \"\"\"\n    Manager class for interface with the isolated atom energies classes\n    and providing the generals function to retrieve the data\n    \"\"\"\n\n    def __init__(self, data, **kwargs) -> None:\n        self.atom_energies = data.energy_type\n        self.factory = dispatch_factory(data, **kwargs)\n\n    @property\n    def e0s_matrix(self) -> np.ndarray:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_matrix\n\n    @property\n    def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_dict\n\n    def __str__(self):\n        return f\"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}\"\n\n    def __repr__(self):\n        return str(self)\n\n    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:\n        \"\"\"\n        Retrieve a key from the isolated atom dictionary.\n        Item can be written as tuple(Symbol, charge),\n        tuple(Chemical number, charge). 
If no charge is passed,\n        it will be automatically set to 0.\n\n        Examples:\n            AtomEnergies[6], AtomEnergies[6,1], \\n\n            AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n            AtomEnergies[(\"C,1)]\n\n        Parameters:\n            item:\n                AtomSpecies object or tuple with the atom symbol and charge\n\n        Returns:\n            AtomEnergy object with the isolated atom energy\n        \"\"\"\n        try:\n            atom, charge = item[0], item[1]\n        except TypeError:\n            atom = item\n            charge = 0\n        except IndexError:\n            atom = item[0]\n            charge = 0\n        if not isinstance(atom, str):\n            atom = ATOM_SYMBOLS[atom]\n        return self.e0s_dict[(atom, charge)]\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_dict","title":"e0s_dict: Dict[AtomSpecies, AtomEnergy] property","text":"

    Return the isolated atom energies dictionary

    Returns:

    Type Description Dict[AtomSpecies, AtomEnergy]

    Dictionary with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_matrix","title":"e0s_matrix: np.ndarray property","text":"

    Return the isolated atom energies matrix

    Returns:

    Type Description ndarray

    Matrix Array with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.__getitem__","title":"__getitem__(item)","text":"

    Retrieve a key from the isolated atom dictionary. Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). If no charge is passed, it will be automatically set to 0.

    Examples:

    AtomEnergies[6], AtomEnergies[6,1],

    AtomEnergies[\"C\",1], AtomEnergies[(6,1)],

    AtomEnergies[(\"C\",1)]

    Parameters:

    Name Type Description Default item AtomSpecies

    AtomSpecies object or tuple with the atom symbol and charge

    required

    Returns:

    Type Description AtomEnergy

    AtomEnergy object with the isolated atom energy

    Source code in openqdc/datasets/energies.py
    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:\n    \"\"\"\n    Retrieve a key from the isolated atom dictionary.\n    Item can be written as tuple(Symbol, charge),\n    tuple(Chemical number, charge). If no charge is passed,\n    it will be automatically set to 0.\n\n    Examples:\n        AtomEnergies[6], AtomEnergies[6,1], \\n\n        AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n        AtomEnergies[(\"C,1)]\n\n    Parameters:\n        item:\n            AtomSpecies object or tuple with the atom symbol and charge\n\n    Returns:\n        AtomEnergy object with the isolated atom energy\n    \"\"\"\n    try:\n        atom, charge = item[0], item[1]\n    except TypeError:\n        atom = item\n        charge = 0\n    except IndexError:\n        atom = item[0]\n        charge = 0\n    if not isinstance(atom, str):\n        atom = ATOM_SYMBOLS[atom]\n    return self.e0s_dict[(atom, charge)]\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy","title":"AtomEnergy dataclass","text":"

    Datastructure to store isolated atom energies and the std deviation associated to the value. By default the std will be 1 if no value was calculated or not available (formation energy case)

    Source code in openqdc/datasets/energies.py
    @dataclass\nclass AtomEnergy:\n    \"\"\"\n    Datastructure to store isolated atom energies\n    and the std deviation associated to the value.\n    By default the std will be 1 if no value was calculated\n    or not available (formation energy case)\n    \"\"\"\n\n    mean: np.array\n    std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32))\n\n    def __post_init__(self):\n        if not isinstance(self.mean, np.ndarray):\n            self.mean = np.array([self.mean], dtype=np.float32)\n\n    def append(self, other: \"AtomEnergy\"):\n        \"\"\"\n        Append the mean and std of another atom energy\n        \"\"\"\n        self.mean = np.append(self.mean, other.mean)\n        self.std = np.append(self.std, other.std)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy.append","title":"append(other)","text":"

    Append the mean and std of another atom energy

    Source code in openqdc/datasets/energies.py
    def append(self, other: \"AtomEnergy\"):\n    \"\"\"\n    Append the mean and std of another atom energy\n    \"\"\"\n    self.mean = np.append(self.mean, other.mean)\n    self.std = np.append(self.std, other.std)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomSpecies","title":"AtomSpecies dataclass","text":"

    Structure that defines a tuple of chemical species and charge and provides hashing and automatic conversion from atomic number to chemical symbol

    Source code in openqdc/datasets/energies.py
    @dataclass(frozen=False, eq=True)\nclass AtomSpecies:\n    \"\"\"\n    Structure that defines a tuple of chemical specie and charge\n    and provide hash and automatic conversion from atom number to\n    checmical symbol\n    \"\"\"\n\n    symbol: Union[str, int]\n    charge: int = 0\n\n    def __post_init__(self):\n        if not isinstance(self.symbol, str):\n            self.symbol = ATOM_SYMBOLS[self.symbol]\n        self.number = ATOMIC_NUMBERS[self.symbol]\n\n    def __hash__(self):\n        return hash((self.symbol, self.charge))\n\n    def __eq__(self, other):\n        if not isinstance(other, AtomSpecies):\n            symbol, charge = other[0], other[1]\n            other = AtomSpecies(symbol=symbol, charge=charge)\n        return (self.number, self.charge) == (other.number, other.charge)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface","title":"IsolatedEnergyInterface","text":"

    Bases: ABC

    Abstract class that defines the interface for the different implementations of an isolated atom energy value

    Source code in openqdc/datasets/energies.py
    class IsolatedEnergyInterface(ABC):\n    \"\"\"\n    Abstract class that defines the interface for the\n    different implementation of an isolated atom energy value\n    \"\"\"\n\n    def __init__(self, data, **kwargs):\n        \"\"\"\n        Parameters:\n            data : openqdc.datasets.Dataset\n                Dataset object that contains the information\n                about the isolated atom energies. Info will be passed\n                by references\n            kwargs : dict\n                Additional arguments that will be passed to the\n                selected energy class. Mostly used for regression\n                to pass the regressor_kwargs.\n        \"\"\"\n        self._e0_matrixs = []\n        self._e0_dict = None\n        self.kwargs = kwargs\n        self.data = data\n        self._post_init()\n\n    @property\n    def refit(self) -> bool:\n        return self.data.refit_e0s\n\n    @abstractmethod\n    def _post_init(self):\n        \"\"\"\n        Main method to fetch/compute/recomputed the isolated atom energies.\n        Need to be implemented in all child classes.\n        \"\"\"\n        pass\n\n    def __len__(self):\n        return len(self.data.energy_methods)\n\n    @property\n    def e0_matrix(self) -> np.ndarray:\n        \"\"\"\n        Return the isolated atom energies matrixes\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return np.array(self._e0_matrixs)\n\n    @property\n    def e0_dict(self) -> Dict:\n        \"\"\"\n        Return the isolated atom energies dict\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n\n        return self._e0s_dict\n\n    def __str__(self) -> str:\n        return self.__class__.__name__.lower()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_dict","title":"e0_dict: Dict property","text":"

    Return the isolated atom energies dict

    Returns:

    Type Description Dict

    Dictionary with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_matrix","title":"e0_matrix: np.ndarray property","text":"

    Return the isolated atom energy matrices

    Returns:

    Type Description ndarray

    Matrix Array with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.__init__","title":"__init__(data, **kwargs)","text":"

    Parameters:

    Name Type Description Default data

    openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by reference

    required kwargs

    dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.

    {} Source code in openqdc/datasets/energies.py
    def __init__(self, data, **kwargs):\n    \"\"\"\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n    \"\"\"\n    self._e0_matrixs = []\n    self._e0_dict = None\n    self.kwargs = kwargs\n    self.data = data\n    self._post_init()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.NullEnergy","title":"NullEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that returns a null (zeros) matrix for the isolated atom energies in case no energies are available.

    Source code in openqdc/datasets/energies.py
    class NullEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a null (zeros) matrix for the isolated atom energies in case\n    of no energies are available.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for _ in self.data.__energy_methods__:\n            for key, values in PotentialMethod.NONE.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))]\n        self._assembly_e0_dict()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.PhysicalEnergy","title":"PhysicalEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that returns physical (SE, DFT, etc.) isolated atom energies.

    Source code in openqdc/datasets/energies.py
    class PhysicalEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a physical (SE,DFT,etc) isolated atom energies.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for method in self.data.__energy_methods__:\n            for key, values in method.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__]\n        self._assembly_e0_dict()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy","title":"RegressionEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that computes and returns the regressed isolated atom energies.

    Source code in openqdc/datasets/energies.py
    class RegressionEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that compute and returns the regressed isolated atom energies.\n    \"\"\"\n\n    def _post_init(self):\n        if not self.attempt_load() or self.refit:\n            self.regressor = Regressor.from_openqdc_dataset(self.data, **self.kwargs)\n            E0s, cov = self._compute_regression_e0s()\n            self._set_lin_atom_species_dict(E0s, cov)\n        self._set_linear_e0s()\n\n    def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Try to compute the regressed isolated atom energies.\n        raise an error if the regression fails.\n        return the regressed isolated atom energies and the uncertainty values.\n\n        Returns:\n            Tuple with the regressed isolated atom energies and the uncertainty values of the regression\n            if available.\n        \"\"\"\n        try:\n            E0s, cov = self.regressor.solve()\n        except np.linalg.LinAlgError:\n            logger.warning(f\"Failed to compute E0s using {self.regressor.solver_type} regression.\")\n            raise np.linalg.LinAlgError\n        return E0s, cov\n\n    def _set_lin_atom_species_dict(self, E0s, covs) -> None:\n        \"\"\"\n        Set the regressed isolated atom energies in a dictionary format\n        and Save the values in a pickle file to easy loading.\n        \"\"\"\n        atomic_energies_dict = {}\n        for i, z in enumerate(self.regressor.numbers):\n            for charge in range(-10, 11):\n                atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i])\n            # atomic_energies_dict[z] = E0s[i]\n        self._e0s_dict = atomic_energies_dict\n        self.save_e0s()\n\n    def _set_linear_e0s(self) -> None:\n        \"\"\"\n        Transform the e0s dictionary into the correct e0s\n        matrix format.\n        \"\"\"\n        new_e0s = 
[np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]\n        for z, e0 in self._e0s_dict.items():\n            for i in range(len(self)):\n                # new_e0s[i][z, :] = e0[i]\n                new_e0s[i][z.number, z.charge] = e0.mean[i]\n            # for atom_sp, values in\n        self._e0_matrixs = new_e0s\n\n    def save_e0s(self) -> None:\n        \"\"\"\n        Save the regressed isolated atom energies in a pickle file.\n        \"\"\"\n        save_pkl(self._e0s_dict, self.preprocess_path)\n\n    def attempt_load(self) -> bool:\n        \"\"\"\n        Try to load the regressed isolated atom energies from the\n        object pickle file and return the success of the operation.\n        \"\"\"\n        try:\n            self._e0s_dict = load_pkl(self.preprocess_path)\n            logger.info(f\"Found energy file for {str(self)}.\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Energy file for {str(self)} not found.\")\n            return False\n\n    @property\n    def preprocess_path(self):\n        \"\"\"\n        Return the path to the object pickle file.\n        \"\"\"\n        path = p_join(self.data.root, \"preprocessed\", str(self) + \".pkl\")\n        return path\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.preprocess_path","title":"preprocess_path property","text":"

    Return the path to the object pickle file.

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.attempt_load","title":"attempt_load()","text":"

    Try to load the regressed isolated atom energies from the object pickle file and return the success of the operation.

    Source code in openqdc/datasets/energies.py
    def attempt_load(self) -> bool:\n    \"\"\"\n    Try to load the regressed isolated atom energies from the\n    object pickle file and return the success of the operation.\n    \"\"\"\n    try:\n        self._e0s_dict = load_pkl(self.preprocess_path)\n        logger.info(f\"Found energy file for {str(self)}.\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Energy file for {str(self)} not found.\")\n        return False\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.save_e0s","title":"save_e0s()","text":"

    Save the regressed isolated atom energies in a pickle file.

    Source code in openqdc/datasets/energies.py
    def save_e0s(self) -> None:\n    \"\"\"\n    Save the regressed isolated atom energies in a pickle file.\n    \"\"\"\n    save_pkl(self._e0s_dict, self.preprocess_path)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.dispatch_factory","title":"dispatch_factory(data, **kwargs)","text":"

    Factory function that selects the correct energy class for the fetching/calculation of isolated atom energies.

    Parameters:

    Name Type Description Default data

    openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by reference

    required kwargs

    dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.

    {}

    Returns:

    Type Description IsolatedEnergyInterface

    Initialized IsolatedEnergyInterface-like object

    Source code in openqdc/datasets/energies.py
    def dispatch_factory(data: Any, **kwargs: Dict) -> \"IsolatedEnergyInterface\":\n    \"\"\"\n    Factory function that select the correct\n    energy class for the fetching/calculation\n    of isolated atom energies.\n\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n\n    Returns:\n        Initialized IsolatedEnergyInterface-like object\n    \"\"\"\n    if data.energy_type == \"formation\":\n        return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"regression\":\n        try:\n            return RegressionEnergy(data, **kwargs)\n        except np.linalg.LinAlgError:\n            logger.warning(\"Error! Using physical energies instead.\")\n            return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"null\":\n        return NullEnergy(data, **kwargs)\n
    "},{"location":"API/formats.html","title":"Format loading","text":""},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure","title":"GeneralStructure","text":"

    Bases: ABC

    Abstract factory class for dataset types in the openQDC package.

    Source code in openqdc/datasets/structure.py
    class GeneralStructure(ABC):\n    \"\"\"\n    Abstract Factory class for datasets type in the openQDC package.\n    \"\"\"\n\n    _ext: Optional[str] = None\n    _extra_files: Optional[List[str]] = None\n\n    @property\n    def ext(self):\n        return self._ext\n\n    @property\n    @abstractmethod\n    def load_fn(self) -> Callable:\n        \"\"\"\n        Function to use for loading the data.\n        Must be implemented by the child class.\n\n        Returns:\n            the function to use for loading the data\n        \"\"\"\n        raise NotImplementedError\n\n    def add_extension(self, filename: str) -> str:\n        \"\"\"\n        Add the correct extension to a filename\n\n        Parameters:\n            filename:  the filename to add the extension to\n\n        Returns:\n            the filename with the extension\n        \"\"\"\n        return filename + self.ext\n\n    @abstractmethod\n    def save_preprocess(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_dict: Dict[str, np.ndarray],\n        extra_data_keys: List[str],\n        extra_data_types: Dict[str, type],\n    ) -> List[str]:\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n        Must be implemented by the child class.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_dict:        dictionary of data to save\n            extra_data_keys:  list of keys to load from the extra data file\n            extra_data_types: dictionary of data types for each key\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def load_extra_files(\n        self,\n        data: Dict[str, np.ndarray],\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        pkl_data_keys: List[str],\n      
  overwrite: bool,\n    ):\n        \"\"\"\n        Load extra files required to define other types of data.\n        Must be implemented by the child class.\n\n        Parameters:\n            data:  dictionary of data to load\n            preprocess_path:  path to the preprocessed data file\n            data_keys:    list of keys to load from the data file\n            pkl_data_keys:   list of keys to load from the extra files\n            overwrite:   whether to overwrite the local cache\n        \"\"\"\n        raise NotImplementedError\n\n    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n        \"\"\"\n        Join a path and a filename and add the correct extension.\n\n        Parameters:\n            path:  the path to join\n            filename:  the filename to join\n\n        Returns:\n            the joined path with the correct extension\n        \"\"\"\n        return p_join(path, self.add_extension(filename))\n\n    def load_data(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_types: Dict[str, np.dtype],\n        data_shapes: Dict[str, Tuple[int, int]],\n        extra_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Main method to load the data from a filetype structure like memmap or zarr.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_types:       dictionary of data types for each key\n            data_shapes:      dictionary of shapes for each key\n            extra_data_keys:  list of keys to load from the extra data file\n            overwrite:        whether to overwrite the local cache\n        \"\"\"\n        data = {}\n        for key in data_keys:\n            filename = self.join_and_ext(preprocess_path, key)\n            pull_locally(filename, overwrite=overwrite)\n         
   data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n            data[key] = self.unpack(data[key])\n            data[key] = data[key].reshape(*data_shapes[key])\n\n        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n        return data\n\n    def unpack(self, data: any) -> any:\n        \"\"\"\n        Unpack the data from the loaded file.\n\n        Parameters:\n            data:  the data to unpack\n\n        Returns:\n            the unpacked data\n        \"\"\"\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_fn","title":"load_fn: Callable abstractmethod property","text":"

    Function to use for loading the data. Must be implemented by the child class.

    Returns:

    Type Description Callable

    the function to use for loading the data

    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.add_extension","title":"add_extension(filename)","text":"

    Add the correct extension to a filename

    Parameters:

    Name Type Description Default filename str

    the filename to add the extension to

    required

    Returns:

    Type Description str

    the filename with the extension

    Source code in openqdc/datasets/structure.py
    def add_extension(self, filename: str) -> str:\n    \"\"\"\n    Add the correct extension to a filename\n\n    Parameters:\n        filename:  the filename to add the extension to\n\n    Returns:\n        the filename with the extension\n    \"\"\"\n    return filename + self.ext\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.join_and_ext","title":"join_and_ext(path, filename)","text":"

    Join a path and a filename and add the correct extension.

    Parameters:

    Name Type Description Default path Union[str, PathLike]

    the path to join

    required filename str

    the filename to join

    required

    Returns:

    Type Description Union[str, PathLike]

    the joined path with the correct extension

    Source code in openqdc/datasets/structure.py
    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n    \"\"\"\n    Join a path and a filename and add the correct extension.\n\n    Parameters:\n        path:  the path to join\n        filename:  the filename to join\n\n    Returns:\n        the joined path with the correct extension\n    \"\"\"\n    return p_join(path, self.add_extension(filename))\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_data","title":"load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)","text":"

    Main method to load the data from a filetype structure like memmap or zarr.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required data_types Dict[str, dtype]

    dictionary of data types for each key

    required data_shapes Dict[str, Tuple[int, int]]

    dictionary of shapes for each key

    required extra_data_keys List[str]

    list of keys to load from the extra data file

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    def load_data(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_types: Dict[str, np.dtype],\n    data_shapes: Dict[str, Tuple[int, int]],\n    extra_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Main method to load the data from a filetype structure like memmap or zarr.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_types:       dictionary of data types for each key\n        data_shapes:      dictionary of shapes for each key\n        extra_data_keys:  list of keys to load from the extra data file\n        overwrite:        whether to overwrite the local cache\n    \"\"\"\n    data = {}\n    for key in data_keys:\n        filename = self.join_and_ext(preprocess_path, key)\n        pull_locally(filename, overwrite=overwrite)\n        data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n        data[key] = self.unpack(data[key])\n        data[key] = data[key].reshape(*data_shapes[key])\n\n    data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_extra_files","title":"load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite) abstractmethod","text":"

    Load extra files required to define other types of data. Must be implemented by the child class.

    Parameters:

    Name Type Description Default data Dict[str, ndarray]

    dictionary of data to load

    required preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required pkl_data_keys List[str]

    list of keys to load from the extra files

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef load_extra_files(\n    self,\n    data: Dict[str, np.ndarray],\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    pkl_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Load extra files required to define other types of data.\n    Must be implemented by the child class.\n\n    Parameters:\n        data:  dictionary of data to load\n        preprocess_path:  path to the preprocessed data file\n        data_keys:    list of keys to load from the data file\n        pkl_data_keys:   list of keys to load from the extra files\n        overwrite:   whether to overwrite the local cache\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.save_preprocess","title":"save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) abstractmethod","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage. Must be implemented by the child class.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to save to the data file

    required data_dict Dict[str, ndarray]

    dictionary of data to save

    required extra_data_keys List[str]

    list of keys to save to the extra data file

    required extra_data_types Dict[str, type]

    dictionary of data types for each key

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef save_preprocess(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_dict: Dict[str, np.ndarray],\n    extra_data_keys: List[str],\n    extra_data_types: Dict[str, type],\n) -> List[str]:\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n    Must be implemented by the child class.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_dict:        dictionary of data to save\n        extra_data_keys:  list of keys to load from the extra data file\n        extra_data_types: dictionary of data types for each key\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.unpack","title":"unpack(data)","text":"

    Unpack the data from the loaded file.

    Parameters:

    Name Type Description Default data any

    the data to unpack

    required

    Returns:

    Type Description any

    the unpacked data

    Source code in openqdc/datasets/structure.py
    def unpack(self, data: any) -> any:\n    \"\"\"\n    Unpack the data from the loaded file.\n\n    Parameters:\n        data:  the data to unpack\n\n    Returns:\n        the unpacked data\n    \"\"\"\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.MemMapDataset","title":"MemMapDataset","text":"

    Bases: GeneralStructure

    Dataset structure for memory-mapped numpy arrays and props.pkl files.

    Source code in openqdc/datasets/structure.py
    class MemMapDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for memory-mapped numpy arrays and props.pkl files.\n    \"\"\"\n\n    _ext = \".mmap\"\n    _extra_files = [\"props.pkl\"]\n\n    @property\n    def load_fn(self):\n        return np.memmap\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        local_paths = []\n        for key in data_keys:\n            local_path = self.join_and_ext(preprocess_path, key)\n            out = np.memmap(local_path, mode=\"w+\", dtype=data_dict[key].dtype, shape=data_dict[key].shape)\n            out[:] = data_dict.pop(key)[:]\n            out.flush()\n            local_paths.append(local_path)\n\n        # save smiles and subset\n        local_path = p_join(preprocess_path, \"props.pkl\")\n\n        # assert that (required) pkl keys are present in data_dict\n        assert all([key in data_dict.keys() for key in extra_data_keys])\n\n        # store unique and inverse indices for str-based pkl keys\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        with open(local_path, \"wb\") as f:\n            pkl.dump(data_dict, f)\n\n        local_paths.append(local_path)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = p_join(preprocess_path, \"props.pkl\")\n        pull_locally(filename, overwrite=overwrite)\n        with open(filename, \"rb\") as f:\n            tmp = pkl.load(f)\n            all_pkl_keys = set(tmp.keys()) - set(data_keys)\n            # assert required pkl_keys are present in all_pkl_keys\n            assert all([key in all_pkl_keys for key in pkl_data_keys])\n            for key in all_pkl_keys:\n                x = tmp.pop(key)\n                if len(x) == 2:\n                    data[key] = x[0][x[1]]\n     
           else:\n                    data[key] = x\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.ZarrDataset","title":"ZarrDataset","text":"

    Bases: GeneralStructure

    Dataset structure for zarr files.

    Source code in openqdc/datasets/structure.py
    class ZarrDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for zarr files.\n    \"\"\"\n\n    _ext = \".zip\"\n    _extra_files = [\"metadata.zip\"]\n    _zarr_version = 2\n\n    @property\n    def load_fn(self):\n        return zarr.open\n\n    def unpack(self, data):\n        return data[:]\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        # os.makedirs(p_join(ds.root, \"zips\",  ds.__name__), exist_ok=True)\n        local_paths = []\n        for key, value in data_dict.items():\n            if key not in data_keys:\n                continue\n            zarr_path = self.join_and_ext(preprocess_path, key)\n            value = data_dict.pop(key)\n            z = zarr.open(\n                zarr.storage.ZipStore(zarr_path),\n                \"w\",\n                zarr_version=self._zarr_version,\n                shape=value.shape,\n                dtype=value.dtype,\n            )\n            z[:] = value[:]\n            local_paths.append(zarr_path)\n            # if key in attrs:\n            #    z.attrs.update(attrs[key])\n\n        metadata = p_join(preprocess_path, \"metadata.zip\")\n\n        group = zarr.group(zarr.storage.ZipStore(metadata))\n\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        for key, value in data_dict.items():\n            # sub=group.create_group(key)\n            if key in [\"name\", \"subset\"]:\n                data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)\n                data[:] = value[0][:]\n                data2 = group.create_dataset(key + \"_ptr\", shape=value[1].shape, dtype=np.int32)\n                data2[:] = value[1][:]\n            else:\n                data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)\n                data[:] = value[:]\n        
local_paths.append(metadata)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = self.join_and_ext(preprocess_path, \"metadata\")\n        pull_locally(filename, overwrite=overwrite)\n        tmp = self.load_fn(filename)\n        all_pkl_keys = set(tmp.keys()) - set(data_keys)\n        # assert required pkl_keys are present in all_pkl_keys\n        assert all([key in all_pkl_keys for key in pkl_data_keys])\n        for key in all_pkl_keys:\n            if key not in pkl_data_keys:\n                data[key] = tmp[key][:][tmp[key][:]]\n            else:\n                data[key] = tmp[key][:]\n        return data\n
    "},{"location":"API/methods.html","title":"QM Methods","text":""},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod","title":"InteractionMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class InteractionMethod(QmMethod):\n    CCSD_T_NN = Functional.CCSDT, BasisSet.NN\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    DCCSDT_HA_DZ = Functional.DCCSDT, BasisSet.HA_DZ\n    DCCSDT_HA_TZ = Functional.DCCSDT, BasisSet.HA_TZ\n    DLPNO_CCSDT = Functional.DLPNO_CCSDT, BasisSet.NONE\n    DLPNO_CCSDT0 = (\n        Functional.DLPNO_CCSDT0,\n        BasisSet.NONE,\n    )\n    FN_DMC = Functional.FN_DMC, BasisSet.NONE\n    FIXED = Functional.FIXED, BasisSet.NONE\n    LNO_CCSDT = Functional.LNO_CCSDT, BasisSet.NONE\n    MP2_CBS = Functional.MP2, BasisSet.CBS\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MP2_5_CBS_ADZ = Functional.MP2_5, BasisSet.CBS_ADZ\n    MP2C_CBS = Functional.MP2C, BasisSet.CBS\n    QCISDT_CBS = Functional.QCISDT, BasisSet.CBS\n    SAPT0_AUG_CC_PWCVXZ = Functional.SAPT0, BasisSet.AUG_CC_PWCVXZ\n    SAPT0_JUN_CC_PVDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDZ\n    SAPT0_JUN_CC_PVDDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDDZ\n    SAPT0_AUG_CC_PVDDZ = Functional.SAPT0, BasisSet.AUG_CC_PVDDZ\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get an empty atomization energy dictionary because Interaction methods don't require this\"\"\"\n        return {}\n
    "},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get an empty atomization energy dictionary because Interaction methods don't require this

    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod","title":"PotentialMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1\n    B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP\n    B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ\n    B1LYP_VWN5_TZP = Functional.B1LYP_VWN5, BasisSet.TZP\n    B1PW91_VWN5_DZP = Functional.B1PW91_VWN5, BasisSet.DZP\n    B1PW91_VWN5_SZ = Functional.B1PW91_VWN5, BasisSet.SZ\n    B1PW91_VWN5_TZP = Functional.B1PW91_VWN5, BasisSet.TZP\n    B3LYP_STO3G = Functional.B3LYP, BasisSet.STO3G  # TODO: calculate e0s\n    B3LYP_VWN5_DZP = Functional.B3LYP_VWN5, BasisSet.DZP\n    B3LYP_VWN5_SZ = Functional.B3LYP_VWN5, BasisSet.SZ\n    B3LYP_VWN5_TZP = Functional.B3LYP_VWN5, BasisSet.TZP\n    B3LYP_S_VWN5_DZP = Functional.B3LYP_S_VWN5, BasisSet.DZP\n    B3LYP_S_VWN5_SZ = Functional.B3LYP_S_VWN5, BasisSet.SZ\n    B3LYP_S_VWN5_TZP = Functional.B3LYP_S_VWN5, BasisSet.TZP\n    B3LYP_D_DZP = Functional.B3LYPD, BasisSet.DZP\n    B3LYP_D_SZ = Functional.B3LYPD, BasisSet.SZ\n    B3LYP_D_TZP = Functional.B3LYPD, BasisSet.TZP\n    B3LYP_D3_BJ_DEF2_TZVP = Functional.B3LYP_D3_BJ, BasisSet.DEF2_TZVP\n    B3LYP_6_31G_D = Functional.B3LYP, BasisSet.GSTAR\n    B3LYP_DEF2_TZVP = Functional.B3LYP, BasisSet.DEF2_TZVP\n    B97_1_DZP = Functional.B97_1, BasisSet.DZP\n    B97_1_SZ = Functional.B97_1, BasisSet.SZ\n    B97_1_TZP = Functional.B97_1, BasisSet.TZP\n    B97_2_DZP = Functional.B97_2, BasisSet.DZP\n    B97_2_SZ = Functional.B97_2, BasisSet.SZ\n    B97_2_TZP = Functional.B97_2, BasisSet.TZP\n    B97_D_DZP = Functional.B97_D, BasisSet.DZP\n    B97_D_SZ = Functional.B97_D, BasisSet.SZ\n    B97_D_TZP = Functional.B97_D, BasisSet.TZP\n    B97_DZP = Functional.B97, BasisSet.DZP\n    B97_SZ = Functional.B97, BasisSet.SZ\n    B97_TZP = Functional.B97, BasisSet.TZP\n    BECKE00_X_ONLY_DZP = Functional.BECKE00_X_ONLY, BasisSet.DZP\n    BECKE00_X_ONLY_SZ = Functional.BECKE00_X_ONLY, BasisSet.SZ\n    BECKE00_X_ONLY_TZP = Functional.BECKE00_X_ONLY, BasisSet.TZP\n    BECKE00_DZP = Functional.BECKE00, 
BasisSet.DZP\n    BECKE00_SZ = Functional.BECKE00, BasisSet.SZ\n    BECKE00_TZP = Functional.BECKE00, BasisSet.TZP\n    BECKE00X_XC_DZP = Functional.BECKE00X_XC, BasisSet.DZP\n    BECKE00X_XC_SZ = Functional.BECKE00X_XC, BasisSet.SZ\n    BECKE00X_XC_TZP = Functional.BECKE00X_XC, BasisSet.TZP\n    BECKE88X_BR89C_DZP = Functional.BECKE88X_BR89C, BasisSet.DZP\n    BECKE88X_BR89C_SZ = Functional.BECKE88X_BR89C, BasisSet.SZ\n    BECKE88X_BR89C_TZP = Functional.BECKE88X_BR89C, BasisSet.TZP\n    BHANDH_DZP = Functional.BHANDH, BasisSet.DZP\n    BHANDH_SZ = Functional.BHANDH, BasisSet.SZ\n    BHANDH_TZP = Functional.BHANDH, BasisSet.TZP\n    BHANDHLYP_DZP = Functional.BHANDHLYP, BasisSet.DZP\n    BHANDHLYP_SZ = Functional.BHANDHLYP, BasisSet.SZ\n    BHANDHLYP_TZP = Functional.BHANDHLYP, BasisSet.TZP\n    BLAP3_DZP = Functional.BLAP3, BasisSet.DZP\n    BLAP3_SZ = Functional.BLAP3, BasisSet.SZ\n    BLAP3_TZP = Functional.BLAP3, BasisSet.TZP\n    BLYP_D_DZP = Functional.BLYPD, BasisSet.DZP\n    BLYP_D_SZ = Functional.BLYPD, BasisSet.SZ\n    BLYP_D_TZP = Functional.BLYPD, BasisSet.TZP\n    BLYP_DZP = Functional.BLYP, BasisSet.DZP\n    BLYP_SZ = Functional.BLYP, BasisSet.SZ\n    BLYP_TZP = Functional.BLYP, BasisSet.TZP\n    BMTAU1_DZP = Functional.BMTAU1, BasisSet.DZP\n    BMTAU1_SZ = Functional.BMTAU1, BasisSet.SZ\n    BMTAU1_TZP = Functional.BMTAU1, BasisSet.TZP\n    BOP_DZP = Functional.BOP, BasisSet.DZP\n    BOP_SZ = Functional.BOP, BasisSet.SZ\n    BOP_TZP = Functional.BOP, BasisSet.TZP\n    BP_DZP = Functional.BP, BasisSet.DZP\n    BP_SZ = Functional.BP, BasisSet.SZ\n    BP_TZP = Functional.BP, BasisSet.TZP\n    BP86_D_DZP = Functional.BP86_D, BasisSet.DZP\n    BP86_D_SZ = Functional.BP86_D, BasisSet.SZ\n    BP86_D_TZP = Functional.BP86_D, BasisSet.TZP\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ\n\n    
DFT3B = Functional.DFT3B, BasisSet.NONE\n    DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP\n    FT97_DZP = Functional.FT97, BasisSet.DZP\n    FT97_SZ = Functional.FT97, BasisSet.SZ\n    FT97_TZP = Functional.FT97, BasisSet.TZP\n    GFN1_XTB = Functional.GFN1_XTB, BasisSet.NONE\n    GFN2_XTB = Functional.GFN2_XTB, BasisSet.NONE\n    HCTH_120_DZP = Functional.HCTH_120, BasisSet.DZP\n    HCTH_120_SZ = Functional.HCTH_120, BasisSet.SZ\n    HCTH_120_TZP = Functional.HCTH_120, BasisSet.TZP\n    HCTH_147_DZP = Functional.HCTH_147, BasisSet.DZP\n    HCTH_147_SZ = Functional.HCTH_147, BasisSet.SZ\n    HCTH_147_TZP = Functional.HCTH_147, BasisSet.TZP\n    HCTH_407_DZP = Functional.HCTH_407, BasisSet.DZP\n    HCTH_407_SZ = Functional.HCTH_407, BasisSet.SZ\n    HCTH_407_TZP = Functional.HCTH_407, BasisSet.TZP\n    HCTH_93_DZP = Functional.HCTH_93, BasisSet.DZP\n    HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ\n    HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP\n    HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP\n    HF_CC_PVDZ = (\n        Functional.HF,\n        BasisSet.CC_PVDZ,\n    )\n    HF_CC_PVQZ = (\n        Functional.HF,\n        BasisSet.CC_PVQZ,\n    )\n    HF_CC_PVTZ = (\n        Functional.HF,\n        BasisSet.CC_PVTZ,\n    )\n    KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP\n    KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ\n    KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP\n    KCIS_ORIGINAL_DZP = Functional.KCIS_ORIGINAL, BasisSet.DZP\n    KCIS_ORIGINAL_SZ = Functional.KCIS_ORIGINAL, BasisSet.SZ\n    KCIS_ORIGINAL_TZP = Functional.KCIS_ORIGINAL, BasisSet.TZP\n    KMLYP_VWN5_DZP = Functional.KMLYP_VWN5, BasisSet.DZP\n    KMLYP_VWN5_SZ = Functional.KMLYP_VWN5, BasisSet.SZ\n    KMLYP_VWN5_TZP = Functional.KMLYP_VWN5, BasisSet.TZP\n    KT1_DZP = Functional.KT1, BasisSet.DZP\n    KT1_SZ = Functional.KT1, BasisSet.SZ\n    KT1_TZP = Functional.KT1, BasisSet.TZP\n    KT2_DZP = Functional.KT2, 
BasisSet.DZP\n    KT2_SZ = Functional.KT2, BasisSet.SZ\n    KT2_TZP = Functional.KT2, BasisSet.TZP\n    LDA_VWN_DZP = Functional.LDA_VWN, BasisSet.DZP\n    LDA_VWN_SZ = Functional.LDA_VWN, BasisSet.SZ\n    LDA_VWN_TZP = Functional.LDA_VWN, BasisSet.TZP\n    M05_2X_DZP = Functional.M05_2X, BasisSet.DZP\n    M05_2X_SZ = Functional.M05_2X, BasisSet.SZ\n    M05_2X_TZP = Functional.M05_2X, BasisSet.TZP\n    M05_DZP = Functional.M05, BasisSet.DZP\n    M05_SZ = Functional.M05, BasisSet.SZ\n    M05_TZP = Functional.M05, BasisSet.TZP\n    M06_2X_DZP = Functional.M06_2X, BasisSet.DZP\n    M06_2X_SZ = Functional.M06_2X, BasisSet.SZ\n    M06_2X_TZP = Functional.M06_2X, BasisSet.TZP\n    M06_L_DZP = Functional.M06_L, BasisSet.DZP\n    M06_L_SZ = Functional.M06_L, BasisSet.SZ\n    M06_L_TZP = Functional.M06_L, BasisSet.TZP\n    M06_DZP = Functional.M06, BasisSet.DZP\n    M06_SZ = Functional.M06, BasisSet.SZ\n    M06_TZP = Functional.M06, BasisSet.TZP\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MPBE_DZP = Functional.MPBE, BasisSet.DZP\n    MPBE_SZ = Functional.MPBE, BasisSet.SZ\n    MPBE_TZP = Functional.MPBE, BasisSet.TZP\n    MPBE0KCIS_DZP = Functional.MPBE0KCIS, BasisSet.DZP\n    MPBE0KCIS_SZ = Functional.MPBE0KCIS, BasisSet.SZ\n    MPBE0KCIS_TZP = Functional.MPBE0KCIS, BasisSet.TZP\n    MPBE1KCIS_DZP = Functional.MPBE1KCIS, BasisSet.DZP\n    MPBE1KCIS_SZ = Functional.MPBE1KCIS, BasisSet.SZ\n    MPBE1KCIS_TZP = Functional.MPBE1KCIS, BasisSet.TZP\n    MPBEKCIS_DZP = Functional.MPBEKCIS, BasisSet.DZP\n    MPBEKCIS_SZ = Functional.MPBEKCIS, BasisSet.SZ\n    MPBEKCIS_TZP = Functional.MPBEKCIS, BasisSet.TZP\n    MPW_DZP = Functional.MPW, BasisSet.DZP\n    MPW_SZ = Functional.MPW, BasisSet.SZ\n    MPW_TZP = Functional.MPW, BasisSet.TZP\n    MPW1K_DZP = Functional.MPW1K, BasisSet.DZP\n    MPW1K_SZ = Functional.MPW1K, BasisSet.SZ\n    MPW1K_TZP = Functional.MPW1K, 
BasisSet.TZP\n    MPW1PW_DZP = Functional.MPW1PW, BasisSet.DZP\n    MPW1PW_SZ = Functional.MPW1PW, BasisSet.SZ\n    MPW1PW_TZP = Functional.MPW1PW, BasisSet.TZP\n    MVS_DZP = Functional.MVS, BasisSet.DZP\n    MVS_SZ = Functional.MVS, BasisSet.SZ\n    MVS_TZP = Functional.MVS, BasisSet.TZP\n    MVSX_DZP = Functional.MVSX, BasisSet.DZP\n    MVSX_SZ = Functional.MVSX, BasisSet.SZ\n    MVSX_TZP = Functional.MVSX, BasisSet.TZP\n    O3LYP_VWN5_DZP = Functional.O3LYP_VWN5, BasisSet.DZP\n    O3LYP_VWN5_SZ = Functional.O3LYP_VWN5, BasisSet.SZ\n    O3LYP_VWN5_TZP = Functional.O3LYP_VWN5, BasisSet.TZP\n    OLAP3_DZP = Functional.OLAP3, BasisSet.DZP\n    OLAP3_SZ = Functional.OLAP3, BasisSet.SZ\n    OLAP3_TZP = Functional.OLAP3, BasisSet.TZP\n    OLYP_DZP = Functional.OLYP, BasisSet.DZP\n    OLYP_SZ = Functional.OLYP, BasisSet.SZ\n    OLYP_TZP = Functional.OLYP, BasisSet.TZP\n    OPBE_DZP = Functional.OPBE, BasisSet.DZP\n    OPBE_SZ = Functional.OPBE, BasisSet.SZ\n    OPBE_TZP = Functional.OPBE, BasisSet.TZP\n    OPBE0_DZP = Functional.OPBE0, BasisSet.DZP\n    OPBE0_SZ = Functional.OPBE0, BasisSet.SZ\n    OPBE0_TZP = Functional.OPBE0, BasisSet.TZP\n    OPERDEW_DZP = Functional.OPERDEW, BasisSet.DZP\n    OPERDEW_SZ = Functional.OPERDEW, BasisSet.SZ\n    OPERDEW_TZP = Functional.OPERDEW, BasisSet.TZP\n    PBE_D_DZP = Functional.PBE_D, BasisSet.DZP\n    PBE_D_SZ = Functional.PBE_D, BasisSet.SZ\n    PBE_D_TZP = Functional.PBE_D, BasisSet.TZP\n    PBE_D3_BJ_DEF2_TZVP = Functional.PBE_D3_BJ, BasisSet.DEF2_TZVP\n    PBE_DEF2_TZVP = Functional.PBE, BasisSet.DEF2_TZVP\n    PBE_DZP = Functional.PBE, BasisSet.DZP\n    PBE_SZ = Functional.PBE, BasisSet.SZ\n    PBE_TZP = Functional.PBE, BasisSet.TZP\n    PBE0_DZP = Functional.PBE0, BasisSet.DZP\n    PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP\n    PBE0_SZ = Functional.PBE0, BasisSet.SZ\n    PBE0_TZP = Functional.PBE0, BasisSet.TZP\n    PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD\n    PBESOL_DZP = 
Functional.PBESOL, BasisSet.DZP\n    PBESOL_SZ = Functional.PBESOL, BasisSet.SZ\n    PBESOL_TZP = Functional.PBESOL, BasisSet.TZP\n    PKZB_DZP = Functional.PKZB, BasisSet.DZP\n    PKZB_SZ = Functional.PKZB, BasisSet.SZ\n    PKZB_TZP = Functional.PKZB, BasisSet.TZP\n    PKZBX_KCISCOR_DZP = Functional.PKZBX_KCISCOR, BasisSet.DZP\n    PKZBX_KCISCOR_SZ = Functional.PKZBX_KCISCOR, BasisSet.SZ\n    PKZBX_KCISCOR_TZP = Functional.PKZBX_KCISCOR, BasisSet.TZP\n    PM6 = Functional.PM6, BasisSet.NONE\n    PW91_DZP = Functional.PW91, BasisSet.DZP\n    PW91_SZ = Functional.PW91, BasisSet.SZ\n    PW91_TZP = Functional.PW91, BasisSet.TZP\n    REVPBE_D3_BJ_DEF2_TZVP = Functional.REVPBE_D3_BJ, BasisSet.DEF2_TZVP\n    REVPBE_DZP = Functional.REVPBE, BasisSet.DZP\n    REVPBE_SZ = Functional.REVPBE, BasisSet.SZ\n    REVPBE_TZP = Functional.REVPBE, BasisSet.TZP\n    REVTPSS_DZP = Functional.REVTPSS, BasisSet.DZP\n    REVTPSS_SZ = Functional.REVTPSS, BasisSet.SZ\n    REVTPSS_TZP = Functional.REVTPSS, BasisSet.TZP\n    RGE2_DZP = Functional.RGE2, BasisSet.DZP\n    RGE2_SZ = Functional.RGE2, BasisSet.SZ\n    RGE2_TZP = Functional.RGE2, BasisSet.TZP\n    RPBE_DZP = Functional.RPBE, BasisSet.DZP\n    RPBE_SZ = Functional.RPBE, BasisSet.SZ\n    RPBE_TZP = Functional.RPBE, BasisSet.TZP\n    SSB_D_DZP = Functional.SSB_D, BasisSet.DZP\n    SSB_D_SZ = Functional.SSB_D, BasisSet.SZ\n    SSB_D_TZP = Functional.SSB_D, BasisSet.TZP\n    SVWN_DEF2_TZVP = Functional.SVWN, BasisSet.DEF2_TZVP\n    TMGGA_DZP = Functional.TMGGA, BasisSet.DZP\n    TMGGA_SZ = Functional.TMGGA, BasisSet.SZ\n    TMGGA_TZP = Functional.TMGGA, BasisSet.TZP\n    TAU_HCTH_HYBRID_DZP = Functional.TAU_HCTH_HYBRID, BasisSet.DZP\n    TAU_HCTH_HYBRID_SZ = Functional.TAU_HCTH_HYBRID, BasisSet.SZ\n    TAU_HCTH_HYBRID_TZP = Functional.TAU_HCTH_HYBRID, BasisSet.TZP\n    TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP\n    TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ\n    TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP\n    
TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ\n    TPSSD_DZP = Functional.TPSSD, BasisSet.DZP\n    TPSSD_SZ = Functional.TPSSD, BasisSet.SZ\n    TPSSD_TZP = Functional.TPSSD, BasisSet.TZP\n    TPSS_DZP = Functional.TPSS, BasisSet.DZP\n    TPSS_SZ = Functional.TPSS, BasisSet.SZ\n    TPSS_TZP = Functional.TPSS, BasisSet.TZP\n    TPSSH_DEF2_TZVP = Functional.TPSSH, BasisSet.DEF2_TZVP\n    TPSSH_DZP = Functional.TPSSH, BasisSet.DZP\n    TPSSH_SZ = Functional.TPSSH, BasisSet.SZ\n    TPSSH_TZP = Functional.TPSSH, BasisSet.TZP\n    TTM2_1_F = Functional.TTM2_1_F, BasisSet.NONE\n    VS98_X_XC_DZP = Functional.VS98_X_XC, BasisSet.DZP\n    VS98_X_XC_SZ = Functional.VS98_X_XC, BasisSet.SZ\n    VS98_X_XC_TZP = Functional.VS98_X_XC, BasisSet.TZP\n    VS98_X_ONLY_DZP = Functional.VS98_X_ONLY, BasisSet.DZP\n    VS98_X_ONLY_SZ = Functional.VS98_X_ONLY, BasisSet.SZ\n    VS98_X_ONLY_TZP = Functional.VS98_X_ONLY, BasisSet.TZP\n    VS98_DZP = Functional.VS98, BasisSet.DZP\n    VS98_SZ = Functional.VS98, BasisSet.SZ\n    VS98_TZP = Functional.VS98, BasisSet.TZP\n    WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD\n    WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP\n    WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP\n    WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ\n    WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR\n    WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ\n    X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, BasisSet.DZP\n    X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ\n    X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP\n    XLYP_DZP = Functional.XLYP, BasisSet.DZP\n    XLYP_SZ = Functional.XLYP, BasisSet.SZ\n    XLYP_TZP = Functional.XLYP, BasisSet.TZP\n    NONE = Functional.NONE, BasisSet.NONE\n\n    def _build_default_dict(self):\n        e0_dict = {}\n        for SYMBOL in ATOM_SYMBOLS:\n            for CHARGE in range(-10, 11):\n                e0_dict[(SYMBOL, CHARGE)] = array([0], 
dtype=float32)\n        return e0_dict\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        key = str(self)\n        try:\n            # print(key)\n            energies = atom_energy_collection.get(key, {})\n            if len(energies) == 0:\n                raise\n        except:  # noqa\n            logger.info(f\"No available atomization energy for the QM method {key}. All values are set to 0.\")\n            energies = self._build_default_dict()\n        return energies\n
    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod","title":"QmMethod","text":"

    Bases: Enum

    Source code in openqdc/methods/enums.py
    class QmMethod(Enum):\n    def __init__(self, functional: Functional, basis_set: BasisSet, cost: float = 0):\n        self.functional = functional\n        self.basis_set = basis_set\n        self.cost = cost\n\n    def __str__(self):\n        if self.basis_set != \"\":\n            s = \"/\".join([str(self.functional), str(self.basis_set)])\n        else:\n            s = str(self.functional)\n        return s\n\n    @property\n    def atom_energies_matrix(self):\n        \"\"\"Get the atomization energy matrix\"\"\"\n        energies = self.atom_energies_dict\n        mat = to_e_matrix(energies)\n\n        return mat\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        raise NotImplementedError()\n
    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_matrix","title":"atom_energies_matrix property","text":"

    Get the atomization energy matrix

    "},{"location":"API/methods.html#isolated-atom-energies","title":"Isolated Atom Energies","text":""},{"location":"API/methods.html#openqdc.methods.atom_energies.to_e_matrix","title":"to_e_matrix(atom_energies)","text":"

    Get the matrix of isolated atom energies from a dict of calculated (non-null) values.

    Parameters:

    Name Type Description Default atom_energies Dict

    Dict of energies computed for a given QM method. Keys are pairs of (atom, charge) and values are energy values

    required

    np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)

    Type Description ndarray

    Matrix containing the isolated atom energies for each atom and charge written in the form:

            |   | -2 | -1 | 0 | +1 | +2 | <- charges\n        |---|----|----|---|----|----|\n        | 0 |    |    |   |    |    |\n        | 1 |    |    |   |    |    |\n        | 2 |    |    |   |    |    |\n
    Source code in openqdc/methods/atom_energies.py
    def to_e_matrix(atom_energies: Dict) -> np.ndarray:\n    \"\"\"\n    Get the matrix of isolated atom energies for a dict of non-null values calculates\n\n    Parameters:\n        atom_energies: Dict of energies computed for a given QM method.\n            Keys are pairs of (atom, charge) and values are energy values\n\n    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)\n        Matrix containing the isolated atom energies for each atom and charge written in the form:\n\n                        |   | -2 | -1 | 0 | +1 | +2 | <- charges\n                        |---|----|----|---|----|----|\n                        | 0 |    |    |   |    |    |\n                        | 1 |    |    |   |    |    |\n                        | 2 |    |    |   |    |    |\n    \"\"\"\n\n    matrix = np.zeros((MAX_ATOMIC_NUMBER, MAX_CHARGE_NUMBER))\n    if len(atom_energies) > 0:\n        for key in atom_energies.keys():\n            try:\n                matrix[ATOMIC_NUMBERS[key[0]], key[1] + MAX_CHARGE] = atom_energies[key]\n            except KeyError:\n                logger.error(f\"Isolated atom energies not found for {key}\")\n    return matrix\n
    "},{"location":"API/properties.html","title":"Defined properties for datasets","text":""},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn","title":"DatasetPropertyMixIn","text":"

    Mixin class for BaseDataset class to add properties that are common to all datasets.

    Source code in openqdc/datasets/properties.py
    class DatasetPropertyMixIn:\n    \"\"\"\n    Mixin class for BaseDataset class to add\n    properties that are common to all datasets.\n    \"\"\"\n\n    @property\n    def atoms_per_molecules(self):\n        try:\n            if hasattr(self, \"_n_atoms\"):\n                return self._n_atoms\n            self._n_atoms = self.data[\"n_atoms\"]\n            return self._n_atoms\n        except:  # noqa\n            return None\n\n    @property\n    def _stats(self):\n        return self.__stats__\n\n    def _compute_average_nb_atoms(self):\n        self.__average_nb_atoms__ = np.mean(self.data[\"n_atoms\"])\n\n    @property\n    def average_n_atoms(self) -> int:\n        \"\"\"\n        Average number of atoms in a molecule in the dataset.\n\n        Returns:\n            Average number of atoms in a molecule in the dataset.\n        \"\"\"\n        if self.__average_nb_atoms__ is None:\n            raise StatisticsNotAvailableError(self.__name__)\n        return self.__average_nb_atoms__\n\n    @property\n    def numbers(self) -> np.ndarray:\n        \"\"\"\n        Unique atomic numbers in the dataset\n\n        Returns:\n            Array of the unique atomic numbers in the dataset\n        \"\"\"\n        if hasattr(self, \"_numbers\"):\n            return self._numbers\n        self._numbers = pd.unique(self.data[\"atomic_inputs\"][..., 0]).astype(np.int32)\n        return self._numbers\n\n    @property\n    def charges(self) -> np.ndarray:\n        \"\"\"\n        Unique charges in the dataset\n\n        Returns:\n            Array of the unique charges in the dataset\n        \"\"\"\n        if hasattr(self, \"_charges\"):\n            return self._charges\n        self._charges = np.unique(self.data[\"atomic_inputs\"][..., :2], axis=0).astype(np.int32)\n        return self._charges\n\n    @property\n    def min_max_charges(self) -> Tuple[int, int]:\n        \"\"\"\n        Minimum and maximum charges in the dataset\n\n        Returns:\n            
(min_charge, max_charge)\n        \"\"\"\n        if hasattr(self, \"_min_max_charges\"):\n            return self._min_max_charges\n        self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])\n        return self._min_max_charges\n\n    @property\n    def chemical_species(self) -> np.ndarray:\n        \"\"\"\n        Chemical symbols in the dataset\n\n        Returns:\n            Array of the chemical symbols in the dataset\n        \"\"\"\n        return np.array(ATOM_SYMBOLS)[self.numbers]\n
    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.average_n_atoms","title":"average_n_atoms: int property","text":"

    Average number of atoms in a molecule in the dataset.

    Returns:

    Type Description int

    Average number of atoms in a molecule in the dataset.

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.charges","title":"charges: np.ndarray property","text":"

    Unique charges in the dataset

    Returns:

    Type Description ndarray

    Array of the unique charges in the dataset

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.chemical_species","title":"chemical_species: np.ndarray property","text":"

    Chemical symbols in the dataset

    Returns:

    Type Description ndarray

    Array of the chemical symbols in the dataset

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.min_max_charges","title":"min_max_charges: Tuple[int, int] property","text":"

    Minimum and maximum charges in the dataset

    Returns:

    Type Description Tuple[int, int]

    (min_charge, max_charge)

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.numbers","title":"numbers: np.ndarray property","text":"

    Unique atomic numbers in the dataset

    Returns:

    Type Description ndarray

    Array of the unique atomic numbers in the dataset

    "},{"location":"API/regressor.html","title":"Normalization regressor","text":"

    Linear Atom Energies regression utilities.

    "},{"location":"API/regressor.html#openqdc.utils.regressor.LinearSolver","title":"LinearSolver","text":"

    Bases: Solver

    Linear regression solver.

    Note

    No uncertainty is associated with the solution, as it is quite small.

    Source code in openqdc/utils/regressor.py
    class LinearSolver(Solver):\n    \"\"\"\n    Linear regression solver.\n\n    Note:\n        No Uncertainty associated as it is quite small.\n    \"\"\"\n\n    _regr_str = \"linear\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        E0s = np.linalg.lstsq(X, y, rcond=None)[0]\n        return E0s, None\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor","title":"Regressor","text":"

    Regressor class for preparing and solving regression problem for isolated atom energies. An isolated atom energy regression problem is defined as:

    X = [n_samples, n_species] (number of atoms of each species per sample)

    Y = [n_samples, ] (energies)

    The regression problem is solved by solving the linear system X E0 = Y.

    Example

    For a system of 2 samples (H2O, CH4)

    n_species = 3, n_samples = 2\n\nH2O = 2H, 1O -> X = [2, 1, 0]\n\nCH4 = 4H, 1C -> X = [4, 0, 1]\n\nX = [[2, 1, 0],\n    [ 4, 0, 1]]\n\nY = [[10, 20]]\n\nX E0 = Y\n

    Linear system to solve

    [[2 eH, 1 eO, 0 eC],\n[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]\n
    Source code in openqdc/utils/regressor.py
    class Regressor:\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n    A isolated atom energy regression problem is defined as:\\n\n    X = [n_samples, n_species] (number of atoms of each species per sample)\\n\n    Y = [n_samples, ] (energies)\\n\n    The regression problem is solved by solving the linear system X E0 = Y.\n\n    Example:\n        For a sytem of 2 samples (H20, CH4)\\n\n            n_species = 3, n_samples = 2\\n\n            H20 = 2H , 1O -> X = [2, 1, 0]\\n\n            CH4 = 4C, 1H -> X = [1, 0, 4]\\n\n            X = [[2, 1, 0],\n                [ 1, 0, 4]]\\n\n            Y = [[10, 20]]\\n\n            X E0 = Y\\n\n        Linear system to solve\\n\n            [[2 eH, 1 eO, 0 eC],\n            [ 1 eH, 0 eO, 4 eC]] = [[10, 20]]\n    \"\"\"\n\n    solver: Solver\n\n    def __init__(\n        self,\n        energies: np.ndarray,\n        atomic_numbers: np.ndarray,\n        position_idx_range: np.ndarray,\n        solver_type: str = \"linear\",\n        stride: int = 1,\n        subsample: Optional[Union[float, int]] = None,\n        remove_nan: bool = True,\n        *args: any,\n        **kwargs: any,\n    ):\n        \"\"\"\n        Regressor class for preparing and solving regression problem for isolated atom energies.\n\n        Parameters:\n            energies:\n                numpy array of energies in the shape (n_samples, n_energy_methods)\n            atomic_numbers:\n                numpy array of atomic numbers in the shape (n_atoms,)\n            position_idx_range:\n                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n            solver_type: Type of solver to use. 
[\"linear\", \"ridge\"]\n            stride: Stride to use for the regression.\n            subsample: Sumsample the dataset.\n                If a float, it is interpreted as a fraction of the dataset to use.\n                If >1 it is interpreted as the number of samples to use.\n            remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n        \"\"\"\n        self.subsample = subsample\n        self.stride = stride\n        self.solver_type = solver_type.lower()\n        self.energies = energies\n        self.atomic_numbers = atomic_numbers\n        self.numbers = pd.unique(atomic_numbers)\n        self.position_idx_range = position_idx_range\n        self.remove_nan = remove_nan\n        self.hparams = {\n            \"subsample\": subsample,\n            \"stride\": stride,\n            \"solver_type\": solver_type,\n        }\n        self._post_init()\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n        \"\"\"\n        Initialize the regressor object from an openqdc dataset. 
This is the default method.\n        *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n        Parameters:\n            dataset: openqdc dataset object.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n\n        Returns:\n            Instance of the regressor class.\n        \"\"\"\n        energies = dataset.data[\"energies\"]\n        position_idx_range = dataset.data[\"position_idx_range\"]\n        atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n        return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n\n    def _post_init(self):\n        if self.subsample is not None:\n            self._downsample()\n        self._prepare_inputs()\n        self.solver = self._get_solver()\n\n    def update_hparams(self, hparams):\n        self.hparams.update(hparams)\n\n    def _downsample(self):\n        if self.subsample < 1:\n            idxs = np.arange(self.energies.shape[0])\n            np.random.shuffle(idxs)\n            idxs = idxs[: int(self.energies.shape[0] * self.subsample)]\n            self.energies = self.energies[:: int(1 / self.subsample)]\n            self.position_idx_range = self.position_idx_range[:: int(1 / self.subsample)]\n        else:\n            idxs = np.random.randint(0, self.energies.shape[0], int(self.subsample))\n            self.energies = self.energies[idxs]\n            self.position_idx_range = self.position_idx_range[idxs]\n        self.update_hparams({\"idxs\": idxs})\n\n    def _get_solver(self):\n        try:\n            return AVAILABLE_SOLVERS[self.solver_type]()\n        except KeyError:\n            logger.warning(f\"Unknown solver type {self.solver_type}, defaulting to linear regression.\")\n            return LinearSolver()\n\n    def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:\n        logger.info(\"Preparing 
inputs for regression.\")\n        len_train = self.energies.shape[0]\n        len_zs = len(self.numbers)\n        A = np.zeros((len_train, len_zs))[:: self.stride]\n        B = self.energies[:: self.stride]\n        for i, ij in enumerate(self.position_idx_range[:: self.stride]):\n            tmp = self.atomic_numbers[ij[0] : ij[1]]\n            for j, z in enumerate(self.numbers):\n                A[i, j] = np.count_nonzero(tmp == z)\n        self.X = A\n        self.y = B\n\n    def solve(self):\n        \"\"\"\n        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n        \"\"\"\n        logger.info(f\"Solving regression with {self.solver}.\")\n        E0_list, cov_list = [], []\n        for energy_idx in range(self.y.shape[1]):\n            if self.remove_nan:\n                idxs = non_nan_idxs(self.y[:, energy_idx])\n                X, y = self.X[idxs], self.y[idxs, energy_idx]\n            else:\n                X, y = self.X, self.y[:, energy_idx]\n            E0s, cov = self.solver(X, y)\n            if cov is None:\n                cov = np.zeros_like(E0s) + 1.0\n            E0_list.append(E0s)\n            cov_list.append(cov)\n        return np.vstack(E0_list).T, np.vstack(cov_list).T\n\n    def __call__(self):\n        return self.solve()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.__init__","title":"__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)","text":"

    Regressor class for preparing and solving regression problem for isolated atom energies.

    Parameters:

    Name Type Description Default energies ndarray

    numpy array of energies in the shape (n_samples, n_energy_methods)

    required atomic_numbers ndarray

    numpy array of atomic numbers in the shape (n_atoms,)

    required position_idx_range ndarray

    array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset

    required solver_type str

    Type of solver to use. [\"linear\", \"ridge\"]

    'linear' stride int

    Stride to use for the regression.

    1 subsample Optional[Union[float, int]]

    Subsample the dataset. If a float, it is interpreted as a fraction of the dataset to use. If >1 it is interpreted as the number of samples to use.

    None remove_nan bool

    Sanitize the dataset by removing energies samples with NaN values.

    True *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {} Source code in openqdc/utils/regressor.py
    def __init__(\n    self,\n    energies: np.ndarray,\n    atomic_numbers: np.ndarray,\n    position_idx_range: np.ndarray,\n    solver_type: str = \"linear\",\n    stride: int = 1,\n    subsample: Optional[Union[float, int]] = None,\n    remove_nan: bool = True,\n    *args: any,\n    **kwargs: any,\n):\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n\n    Parameters:\n        energies:\n            numpy array of energies in the shape (n_samples, n_energy_methods)\n        atomic_numbers:\n            numpy array of atomic numbers in the shape (n_atoms,)\n        position_idx_range:\n            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n        solver_type: Type of solver to use. [\"linear\", \"ridge\"]\n        stride: Stride to use for the regression.\n        subsample: Sumsample the dataset.\n            If a float, it is interpreted as a fraction of the dataset to use.\n            If >1 it is interpreted as the number of samples to use.\n        remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n    \"\"\"\n    self.subsample = subsample\n    self.stride = stride\n    self.solver_type = solver_type.lower()\n    self.energies = energies\n    self.atomic_numbers = atomic_numbers\n    self.numbers = pd.unique(atomic_numbers)\n    self.position_idx_range = position_idx_range\n    self.remove_nan = remove_nan\n    self.hparams = {\n        \"subsample\": subsample,\n        \"stride\": stride,\n        \"solver_type\": solver_type,\n    }\n    self._post_init()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, *args, **kwargs) classmethod","text":"

    Initialize the regressor object from an openqdc dataset. This is the default method. *args and **kwargs are passed to the __init__ method and depend on the specific regressor.

    Parameters:

    Name Type Description Default dataset any

    openqdc dataset object.

    required *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {}

    Returns:

    Type Description Regressor

    Instance of the regressor class.

    Source code in openqdc/utils/regressor.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n    \"\"\"\n    Initialize the regressor object from an openqdc dataset. This is the default method.\n    *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n    Parameters:\n        dataset: openqdc dataset object.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n\n    Returns:\n        Instance of the regressor class.\n    \"\"\"\n    energies = dataset.data[\"energies\"]\n    position_idx_range = dataset.data[\"position_idx_range\"]\n    atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n    return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.solve","title":"solve()","text":"

    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    def solve(self):\n    \"\"\"\n    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n    \"\"\"\n    logger.info(f\"Solving regression with {self.solver}.\")\n    E0_list, cov_list = [], []\n    for energy_idx in range(self.y.shape[1]):\n        if self.remove_nan:\n            idxs = non_nan_idxs(self.y[:, energy_idx])\n            X, y = self.X[idxs], self.y[idxs, energy_idx]\n        else:\n            X, y = self.X, self.y[:, energy_idx]\n        E0s, cov = self.solver(X, y)\n        if cov is None:\n            cov = np.zeros_like(E0s) + 1.0\n        E0_list.append(E0s)\n        cov_list.append(cov)\n    return np.vstack(E0_list).T, np.vstack(cov_list).T\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.RidgeSolver","title":"RidgeSolver","text":"

    Bases: Solver

    Ridge regression solver.

    Source code in openqdc/utils/regressor.py
    class RidgeSolver(Solver):\n    \"\"\"\n    Ridge regression solver.\n    \"\"\"\n\n    _regr_str = \"ridge\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        A = X.T @ X\n        dy = y - (np.sum(X, axis=1, keepdims=True) * y_mean).reshape(y.shape)\n        Xy = X.T @ dy\n        mean = np.linalg.solve(A, Xy)\n        sigma2 = np.var(X @ mean - dy)\n        Ainv = np.linalg.inv(A)\n        cov = np.sqrt(sigma2 * np.einsum(\"ij,kj,kl,li->i\", Ainv, X, X, Ainv))\n        mean = mean + y_mean.reshape([-1])\n        return mean, cov\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver","title":"Solver","text":"

    Bases: ABC

    Abstract class for regression solvers.

    Source code in openqdc/utils/regressor.py
    class Solver(ABC):\n    \"\"\"Abstract class for regression solvers.\"\"\"\n\n    _regr_str: str\n\n    @staticmethod\n    @abstractmethod\n    def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Main method to solve the regression problem.\n        Must be implemented in all the subclasses.\n\n        Parameters:\n            X: Input features of shape (n_samples, n_species)\n            Y: Target values of shape (n_samples,) (energy values for the regression)\n\n        Returns:\n            Tuple of predicted values and the estimated uncertainty.\n        \"\"\"\n        pass\n\n    def __call__(self, X, Y):\n        return self.solve(X, Y)\n\n    def __str__(self):\n        return self._regr_str\n\n    def __repr__(self):\n        return str(self)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver.solve","title":"solve(X, Y) abstractmethod staticmethod","text":"

    Main method to solve the regression problem. Must be implemented in all the subclasses.

    Parameters:

    Name Type Description Default X ndarray

    Input features of shape (n_samples, n_species)

    required Y ndarray

    Target values of shape (n_samples,) (energy values for the regression)

    required

    Returns:

    Type Description Tuple[ndarray, Optional[ndarray]]

    Tuple of predicted values and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    @staticmethod\n@abstractmethod\ndef solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n    \"\"\"\n    Main method to solve the regression problem.\n    Must be implemented in all the subclasses.\n\n    Parameters:\n        X: Input features of shape (n_samples, n_species)\n        Y: Target values of shape (n_samples,) (energy values for the regression)\n\n    Returns:\n        Tuple of predicted values and the estimated uncertainty.\n    \"\"\"\n    pass\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.atom_standardization","title":"atom_standardization(X, y)","text":"

    Standardize the energies and the atom counts. This will make the calculated uncertainty more meaningful.

    Source code in openqdc/utils/regressor.py
    def atom_standardization(X, y):\n    \"\"\"\n    Standardize the energies and the atom counts.\n    This will make the calculated uncertainty more\n    meaningful.\n    \"\"\"\n    X_norm = X.sum()\n    X = X / X_norm\n    y = y / X_norm\n    y_mean = y.sum() / X.sum()\n    return X, y, y_mean\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.non_nan_idxs","title":"non_nan_idxs(array)","text":"

    Return non nan indices of an array.

    Source code in openqdc/utils/regressor.py
    def non_nan_idxs(array):\n    \"\"\"\n    Return non nan indices of an array.\n    \"\"\"\n    return np.where(~np.isnan(array))[0]\n
    "},{"location":"API/statistics.html","title":"Statistics","text":""},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator","title":"AbstractStatsCalculator","text":"

    Bases: ABC

    Abstract class that defines the interface for all calculator objects and the methods to compute the statistics.

    Source code in openqdc/datasets/statistics.py
    class AbstractStatsCalculator(ABC):\n    \"\"\"\n    Abstract class that defines the interface for all\n    the calculators object and the methods to\n    compute the statistics.\n    \"\"\"\n\n    # State Dependencies of the calculator to skip part of the calculation\n    state_dependency = []\n    name = None\n\n    def __init__(\n        self,\n        name: str,\n        energy_type: Optional[str] = None,\n        force_recompute: bool = False,\n        energies: Optional[np.ndarray] = None,\n        n_atoms: Optional[np.ndarray] = None,\n        atom_species: Optional[np.ndarray] = None,\n        position_idx_range: Optional[np.ndarray] = None,\n        e0_matrix: Optional[np.ndarray] = None,\n        atom_charges: Optional[np.ndarray] = None,\n        forces: Optional[np.ndarray] = None,\n    ):\n        \"\"\"\n        Parameters:\n            name :\n                Name of the dataset for saving and loading.\n            energy_type :\n                Type of the energy for the computation of the statistics. 
Used for loading and saving.\n            force_recompute :\n                Flag to force the recomputation of the statistics\n            energies : n\n                Energies of the dataset\n            n_atoms :\n                Number of atoms in the dataset\n            atom_species :\n                Atomic species of the dataset\n            position_idx_range : n\n                Position index range of the dataset\n            e0_matrix :\n                Isolated atom energies matrix of the dataset\n            atom_charges :\n                Atomic charges of the dataset\n            forces :\n                Forces of the dataset\n        \"\"\"\n        self.name = name\n        self.energy_type = energy_type\n        self.force_recompute = force_recompute\n        self.energies = energies\n        self.forces = forces\n        self.position_idx_range = position_idx_range\n        self.e0_matrix = e0_matrix\n        self.n_atoms = n_atoms\n        self.atom_species_charges_tuple = (atom_species, atom_charges)\n        self._root = p_join(get_local_cache(), self.name)\n        if atom_species is not None and atom_charges is not None:\n            # by value not reference\n            self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n\n    @property\n    def has_forces(self) -> bool:\n        return self.forces is not None\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"statistics\", self.name + f\"_{str(self)}\" + \".pkl\")\n        return path\n\n    @property\n    def root(self):\n        \"\"\"\n        Path to the dataset folder\n        \"\"\"\n        return self._root\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset, recompute: bool = False):\n        \"\"\"\n        Create a calculator object from a dataset object.\n        \"\"\"\n        obj = cls(\n            name=dataset.__name__,\n            force_recompute=recompute,\n            
energy_type=dataset.energy_type,\n            energies=dataset.data[\"energies\"],\n            forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n            n_atoms=dataset.data[\"n_atoms\"],\n            position_idx_range=dataset.data[\"position_idx_range\"],\n            atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n            atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n            e0_matrix=dataset.__isolated_atom_energies__,\n        )\n        obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n        return obj\n\n    @abstractmethod\n    def compute(self) -> StatisticsResults:\n        \"\"\"\n        Abstract method to compute the statistics.\n        Must return a StatisticsResults object and be implemented\n        in all the childs\n        \"\"\"\n        raise NotImplementedError\n\n    def save_statistics(self) -> None:\n        \"\"\"\n        Save statistics file to the dataset folder as a pkl file\n        \"\"\"\n        save_pkl(self.result, self.preprocess_path)\n\n    def attempt_load(self) -> bool:\n        \"\"\"\n        Load precomputed statistics file and return the success of the operation\n        \"\"\"\n        try:\n            self.result = load_pkl(self.preprocess_path)\n            logger.info(f\"Statistics for {str(self)} loaded successfully\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n            return False\n\n    def _setup_deps(self, state: Dict) -> None:\n        \"\"\"\n        Check if the dependencies of calculators are satisfied\n        from the state object and set the attributes of the calculator\n        to skip part of the calculation\n        \"\"\"\n        self.state = state\n        self.deps_satisfied = all([dep in state for dep in self.state_dependency])\n        if self.deps_satisfied:\n            for dep in self.state_dependency:\n                setattr(self, dep, state[dep])\n\n    def write_state(self, update: Dict) -> None:\n        \"\"\"\n        Write/update the state dictionary with the update dictionary\n\n        update:\n            dictionary containing the update to the state\n        \"\"\"\n        self.state.update(update)\n\n    def run(self, state: Dict) -> None:\n        \"\"\"\n        Main method to run the calculator.\n        Setup the dependencies from the state dictionary\n        Check if the statistics are already computed and load them or\n        recompute them\n        Save the statistics in the correct folder\n\n        state:\n            dictionary containing the state of the calculator\n        \"\"\"\n        self._setup_deps(state)\n        if self.force_recompute or not self.attempt_load():\n            self.result = self.compute()\n            self.save_statistics()\n\n    def __str__(self) -> str:\n        return self.__class__.__name__.lower()\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.root","title":"root property","text":"

    Path to the dataset folder

    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.__init__","title":"__init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, forces=None)","text":"

    Parameters:

    Name Type Description Default name

    Name of the dataset for saving and loading.

    required energy_type

    Type of the energy for the computation of the statistics. Used for loading and saving.

    None force_recompute

    Flag to force the recomputation of the statistics

    False energies

    Energies of the dataset

    None n_atoms

    Number of atoms in the dataset

    None atom_species

    Atomic species of the dataset

    None position_idx_range

    Position index range of the dataset

    None e0_matrix

    Isolated atom energies matrix of the dataset

    None atom_charges

    Atomic charges of the dataset

    None forces

    Forces of the dataset

    None Source code in openqdc/datasets/statistics.py
    def __init__(\n    self,\n    name: str,\n    energy_type: Optional[str] = None,\n    force_recompute: bool = False,\n    energies: Optional[np.ndarray] = None,\n    n_atoms: Optional[np.ndarray] = None,\n    atom_species: Optional[np.ndarray] = None,\n    position_idx_range: Optional[np.ndarray] = None,\n    e0_matrix: Optional[np.ndarray] = None,\n    atom_charges: Optional[np.ndarray] = None,\n    forces: Optional[np.ndarray] = None,\n):\n    \"\"\"\n    Parameters:\n        name :\n            Name of the dataset for saving and loading.\n        energy_type :\n            Type of the energy for the computation of the statistics. Used for loading and saving.\n        force_recompute :\n            Flag to force the recomputation of the statistics\n        energies : n\n            Energies of the dataset\n        n_atoms :\n            Number of atoms in the dataset\n        atom_species :\n            Atomic species of the dataset\n        position_idx_range : n\n            Position index range of the dataset\n        e0_matrix :\n            Isolated atom energies matrix of the dataset\n        atom_charges :\n            Atomic charges of the dataset\n        forces :\n            Forces of the dataset\n    \"\"\"\n    self.name = name\n    self.energy_type = energy_type\n    self.force_recompute = force_recompute\n    self.energies = energies\n    self.forces = forces\n    self.position_idx_range = position_idx_range\n    self.e0_matrix = e0_matrix\n    self.n_atoms = n_atoms\n    self.atom_species_charges_tuple = (atom_species, atom_charges)\n    self._root = p_join(get_local_cache(), self.name)\n    if atom_species is not None and atom_charges is not None:\n        # by value not reference\n        self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.attempt_load","title":"attempt_load()","text":"

    Load precomputed statistics file and return the success of the operation

    Source code in openqdc/datasets/statistics.py
    def attempt_load(self) -> bool:\n    \"\"\"\n    Load precomputed statistics file and return the success of the operation\n    \"\"\"\n    try:\n        self.result = load_pkl(self.preprocess_path)\n        logger.info(f\"Statistics for {str(self)} loaded successfully\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Statistics for {str(self)} not found. Computing...\")\n        return False\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.compute","title":"compute() abstractmethod","text":"

    Abstract method to compute the statistics. Must return a StatisticsResults object and be implemented in all child classes.

    Source code in openqdc/datasets/statistics.py
    @abstractmethod\ndef compute(self) -> StatisticsResults:\n    \"\"\"\n    Abstract method to compute the statistics.\n    Must return a StatisticsResults object and be implemented\n    in all the childs\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, recompute=False) classmethod","text":"

    Create a calculator object from a dataset object.

    Source code in openqdc/datasets/statistics.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset, recompute: bool = False):\n    \"\"\"\n    Create a calculator object from a dataset object.\n    \"\"\"\n    obj = cls(\n        name=dataset.__name__,\n        force_recompute=recompute,\n        energy_type=dataset.energy_type,\n        energies=dataset.data[\"energies\"],\n        forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n        n_atoms=dataset.data[\"n_atoms\"],\n        position_idx_range=dataset.data[\"position_idx_range\"],\n        atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n        atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n        e0_matrix=dataset.__isolated_atom_energies__,\n    )\n    obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n    return obj\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.run","title":"run(state)","text":"

    Main method to run the calculator. Sets up the dependencies from the state dictionary, then either loads the precomputed statistics or, if they are missing or recomputation is forced, computes them and saves them in the correct folder.

    state

    dictionary containing the state of the calculator

    Source code in openqdc/datasets/statistics.py
    def run(self, state: Dict) -> None:\n    \"\"\"\n    Main method to run the calculator.\n    Setup the dependencies from the state dictionary\n    Check if the statistics are already computed and load them or\n    recompute them\n    Save the statistics in the correct folder\n\n    state:\n        dictionary containing the state of the calculator\n    \"\"\"\n    self._setup_deps(state)\n    if self.force_recompute or not self.attempt_load():\n        self.result = self.compute()\n        self.save_statistics()\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.save_statistics","title":"save_statistics()","text":"

    Save statistics file to the dataset folder as a pkl file

    Source code in openqdc/datasets/statistics.py
    def save_statistics(self) -> None:\n    \"\"\"\n    Save statistics file to the dataset folder as a pkl file\n    \"\"\"\n    save_pkl(self.result, self.preprocess_path)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.write_state","title":"write_state(update)","text":"

    Write/update the state dictionary with the update dictionary

    update

    dictionary containing the update to the state

    Source code in openqdc/datasets/statistics.py
    def write_state(self, update: Dict) -> None:\n    \"\"\"\n    Write/update the state dictionary with the update dictionary\n\n    update:\n        dictionary containing the update to the state\n    \"\"\"\n    self.state.update(update)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.EnergyStatistics","title":"EnergyStatistics dataclass","text":"

    Bases: StatisticsResults

    Dataclass for energy related statistics

    Source code in openqdc/datasets/statistics.py
    @dataclass\nclass EnergyStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for energy related statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.ForceStatistics","title":"ForceStatistics dataclass","text":"

    Bases: StatisticsResults

    Dataclass for force statistics

    Source code in openqdc/datasets/statistics.py
    @dataclass\nclass ForceStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for force statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n    component_mean: Optional[np.ndarray]\n    component_std: Optional[np.ndarray]\n    component_rms: Optional[np.ndarray]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.ForcesCalculatorStats","title":"ForcesCalculatorStats","text":"

    Bases: AbstractStatsCalculator

    Forces statistics calculator class

    Source code in openqdc/datasets/statistics.py
    class ForcesCalculatorStats(AbstractStatsCalculator):\n    \"\"\"\n    Forces statistics calculator class\n    \"\"\"\n\n    def compute(self) -> ForceStatistics:\n        if not self.has_forces:\n            return ForceStatistics(mean=None, std=None, component_mean=None, component_std=None, component_rms=None)\n        converted_force_data = self.forces\n        num_methods = converted_force_data.shape[2]\n        mean = np.nanmean(converted_force_data.reshape(-1, num_methods), axis=0)\n        std = np.nanstd(converted_force_data.reshape(-1, num_methods), axis=0)\n        component_mean = np.nanmean(converted_force_data, axis=0)\n        component_std = np.nanstd(converted_force_data, axis=0)\n        component_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))\n        return ForceStatistics(\n            mean=np.atleast_2d(mean),\n            std=np.atleast_2d(std),\n            component_mean=np.atleast_2d(component_mean),\n            component_std=np.atleast_2d(component_std),\n            component_rms=np.atleast_2d(component_rms),\n        )\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyInterface","title":"FormationEnergyInterface","text":"

    Bases: AbstractStatsCalculator, ABC

    Formation Energy interface calculator class. Defines how the formation_energy dependency is used in the compute method.

    Source code in openqdc/datasets/statistics.py
    class FormationEnergyInterface(AbstractStatsCalculator, ABC):\n    \"\"\"\n    Formation Energy interface calculator class.\n    Define the use of the dependency formation_energy in the\n    compute method\n    \"\"\"\n\n    state_dependency = [\"formation_energy\"]\n\n    def compute(self) -> EnergyStatistics:\n        # if the state has not the dependency satisfied\n        if not self.deps_satisfied:\n            # run the main computation\n            from openqdc.utils.constants import MAX_CHARGE\n\n            splits_idx = self.position_idx_range[:, 1]\n            s = np.array(self.atom_species_charges_tuple, dtype=int)\n            s[:, 1] += MAX_CHARGE\n            matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.e0_matrix]\n            converted_energy_data = self.energies\n            E = []\n            for i, matrix in enumerate(matrixs):\n                c = np.cumsum(np.append([0], matrix))[splits_idx]\n                c[1:] = c[1:] - c[:-1]\n                E.append(converted_energy_data[:, i] - c)\n        else:\n            # if the dependency is satisfied get the dependency\n            E = getattr(self, self.state_dependency[0])\n        self.write_state({self.state_dependency[0]: E})\n        E = np.array(E).T\n        return self._compute(E)\n\n    @abstractmethod\n    def _compute(self, energy) -> EnergyStatistics:\n        raise NotImplementedError\n\n    def __str__(self) -> str:\n        # override the __str__ method to add the energy type to the name\n        # to differentiate between formation and regression type\n        return f\"{self.__class__.__name__.lower()}_{self.energy_type.lower()}\"\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyStats","title":"FormationEnergyStats","text":"

    Bases: FormationEnergyInterface

    Formation Energy calculator class.

    Source code in openqdc/datasets/statistics.py
    class FormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -> EnergyStatistics:\n        formation_E_mean = np.nanmean(energy, axis=0)\n        formation_E_std = np.nanstd(energy, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(formation_E_mean), std=np.atleast_2d(formation_E_std))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.PerAtomFormationEnergyStats","title":"PerAtomFormationEnergyStats","text":"

    Bases: FormationEnergyInterface

    Per atom Formation Energy calculator class.

    Source code in openqdc/datasets/statistics.py
    class PerAtomFormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Per atom Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -> EnergyStatistics:\n        inter_E_mean = np.nanmean((energy / self.n_atoms[:, None]), axis=0)\n        inter_E_std = np.nanstd((energy / self.n_atoms[:, None]), axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(inter_E_mean), std=np.atleast_2d(inter_E_std))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager","title":"StatisticManager","text":"

    Manager class that automatically handles the shared state between the statistic calculators.

    Source code in openqdc/datasets/statistics.py
    class StatisticManager:\n    \"\"\"\n    Manager class that automatically handle the shared state between\n    the statistic calculators\n    \"\"\"\n\n    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n        \"\"\"\n        Parameters:\n            dataset : openqdc.datasets.base.BaseDataset\n                The dataset object to compute the statistics\n            recompute:\n                Flag to recompute the statistics\n            *statistic_calculators:\n                List of statistic calculators to run\n        \"\"\"\n        self._state = {}\n        self._results = {}\n        self._statistic_calculators = [\n            statistic_calculators.from_openqdc_dataset(dataset, recompute)\n            for statistic_calculators in statistic_calculators\n        ]\n\n    @property\n    def state(self) -> Dict:\n        \"\"\"\n        Return the dictionary state of the manager\n\n        Returns:\n            State of the StatisticManager\n        \"\"\"\n        return self._state\n\n    def reset_state(self):\n        \"\"\"\n        Reset the state dictionary\n        \"\"\"\n        self._state = {}\n\n    def reset_results(self):\n        \"\"\"\n        Reset the results dictionary\n        \"\"\"\n        self._results = {}\n\n    def get_state(self, key: Optional[str] = None) -> Optional[Any]:\n        \"\"\"\n        Return the value of the key in the state dictionary\n\n        Parameters:\n            key: str, default = None\n        Returns:\n            the value of the key in the state dictionary\n            or the whole state dictionary if key is None\n        \"\"\"\n        if key is None:\n            return self._state\n        return self._state.get(key, None)\n\n    def has_state(self, key: str) -> bool:\n        \"\"\"\n        Check is state has key\n\n        Parameters:\n            key:\n                Key to check in the state dictionary\n\n        
Returns:\n            True if the key is in the state dictionary\n        \"\"\"\n        return key in self._state\n\n    def get_results(self, as_dict: bool = False):\n        \"\"\"\n        Aggregate results from all the calculators\n\n        Parameters:\n            as_dict:\n                Flag to return the results as a dictionary\n        \"\"\"\n        results = deepcopy(self._results)\n        if as_dict:\n            return {k: v.as_dict() for k, v in results.items()}\n        return {k: v for k, v in self._results.items()}\n\n    def run_calculators(self):\n        \"\"\"\n        Run the saved calculators and save the results in the manager\n        \"\"\"\n        logger.info(\"Processing dataset statistics\")\n        for calculator in self._statistic_calculators:\n            calculator.run(self.state)\n            self._results[calculator.__class__.__name__] = calculator.result\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.state","title":"state: Dict property","text":"

    Return the dictionary state of the manager

    Returns:

    Type Description Dict

    State of the StatisticManager

    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.__init__","title":"__init__(dataset, recompute=False, *statistic_calculators)","text":"

    Parameters:

    Name Type Description Default dataset

    openqdc.datasets.base.BaseDataset The dataset object to compute the statistics

    required recompute bool

    Flag to recompute the statistics

    False *statistic_calculators AbstractStatsCalculator

    List of statistic calculators to run

    () Source code in openqdc/datasets/statistics.py
    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n    \"\"\"\n    Parameters:\n        dataset : openqdc.datasets.base.BaseDataset\n            The dataset object to compute the statistics\n        recompute:\n            Flag to recompute the statistics\n        *statistic_calculators:\n            List of statistic calculators to run\n    \"\"\"\n    self._state = {}\n    self._results = {}\n    self._statistic_calculators = [\n        statistic_calculators.from_openqdc_dataset(dataset, recompute)\n        for statistic_calculators in statistic_calculators\n    ]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_results","title":"get_results(as_dict=False)","text":"

    Aggregate results from all the calculators

    Parameters:

    Name Type Description Default as_dict bool

    Flag to return the results as a dictionary

    False Source code in openqdc/datasets/statistics.py
    def get_results(self, as_dict: bool = False):\n    \"\"\"\n    Aggregate results from all the calculators\n\n    Parameters:\n        as_dict:\n            Flag to return the results as a dictionary\n    \"\"\"\n    results = deepcopy(self._results)\n    if as_dict:\n        return {k: v.as_dict() for k, v in results.items()}\n    return {k: v for k, v in self._results.items()}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_state","title":"get_state(key=None)","text":"

    Return the value of the key in the state dictionary

    Parameters:

    Name Type Description Default key Optional[str]

    str, default = None

    None

    Returns: the value of the key in the state dictionary or the whole state dictionary if key is None

    Source code in openqdc/datasets/statistics.py
    def get_state(self, key: Optional[str] = None) -> Optional[Any]:\n    \"\"\"\n    Return the value of the key in the state dictionary\n\n    Parameters:\n        key: str, default = None\n    Returns:\n        the value of the key in the state dictionary\n        or the whole state dictionary if key is None\n    \"\"\"\n    if key is None:\n        return self._state\n    return self._state.get(key, None)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.has_state","title":"has_state(key)","text":"

    Check if the state has the given key.

    Parameters:

    Name Type Description Default key str

    Key to check in the state dictionary

    required

    Returns:

    Type Description bool

    True if the key is in the state dictionary

    Source code in openqdc/datasets/statistics.py
    def has_state(self, key: str) -> bool:\n    \"\"\"\n    Check is state has key\n\n    Parameters:\n        key:\n            Key to check in the state dictionary\n\n    Returns:\n        True if the key is in the state dictionary\n    \"\"\"\n    return key in self._state\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_results","title":"reset_results()","text":"

    Reset the results dictionary

    Source code in openqdc/datasets/statistics.py
    def reset_results(self):\n    \"\"\"\n    Reset the results dictionary\n    \"\"\"\n    self._results = {}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_state","title":"reset_state()","text":"

    Reset the state dictionary

    Source code in openqdc/datasets/statistics.py
    def reset_state(self):\n    \"\"\"\n    Reset the state dictionary\n    \"\"\"\n    self._state = {}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.run_calculators","title":"run_calculators()","text":"

    Run the saved calculators and save the results in the manager

    Source code in openqdc/datasets/statistics.py
    def run_calculators(self):\n    \"\"\"\n    Run the saved calculators and save the results in the manager\n    \"\"\"\n    logger.info(\"Processing dataset statistics\")\n    for calculator in self._statistic_calculators:\n        calculator.run(self.state)\n        self._results[calculator.__class__.__name__] = calculator.result\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults","title":"StatisticsResults","text":"

    Parent class to statistics results to provide general methods.

    Source code in openqdc/datasets/statistics.py
    class StatisticsResults:\n    \"\"\"\n    Parent class to statistics results\n    to provide general methods.\n    \"\"\"\n\n    def to_dict(self) -> Dict:\n        \"\"\"\n        Convert the class to a dictionary\n\n        Returns:\n            Dictionary representation of the class\n        \"\"\"\n        return asdict(self)\n\n    def transform(self, func: Callable):\n        \"\"\"\n        Apply a function to all the attributes of the class\n\n        Parameters:\n            func:\n                Function to apply to the attributes\n        \"\"\"\n        for k, v in self.to_dict().items():\n            if v is not None:\n                setattr(self, k, func(v))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.to_dict","title":"to_dict()","text":"

    Convert the class to a dictionary

    Returns:

    Type Description Dict

    Dictionary representation of the class

    Source code in openqdc/datasets/statistics.py
    def to_dict(self) -> Dict:\n    \"\"\"\n    Convert the class to a dictionary\n\n    Returns:\n        Dictionary representation of the class\n    \"\"\"\n    return asdict(self)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.transform","title":"transform(func)","text":"

    Apply a function to all the attributes of the class

    Parameters:

    Name Type Description Default func Callable

    Function to apply to the attributes

    required Source code in openqdc/datasets/statistics.py
    def transform(self, func: Callable):\n    \"\"\"\n    Apply a function to all the attributes of the class\n\n    Parameters:\n        func:\n            Function to apply to the attributes\n    \"\"\"\n    for k, v in self.to_dict().items():\n        if v is not None:\n            setattr(self, k, func(v))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.TotalEnergyStats","title":"TotalEnergyStats","text":"

    Bases: AbstractStatsCalculator

    Total Energy statistics calculator class

    Source code in openqdc/datasets/statistics.py
    class TotalEnergyStats(AbstractStatsCalculator):\n    \"\"\"\n    Total Energy statistics calculator class\n    \"\"\"\n\n    def compute(self) -> EnergyStatistics:\n        converted_energy_data = self.energies\n        total_E_mean = np.nanmean(converted_energy_data, axis=0)\n        total_E_std = np.nanstd(converted_energy_data, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(total_E_mean), std=np.atleast_2d(total_E_std))\n
    "},{"location":"API/units.html","title":"UNITS","text":"

    Units conversion utilities module.

    Available Energy units

    [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\", \"mev\", \"ryd\"]

    Available Distance units

    [\"ang\", \"nm\", \"bohr\"]

    Available Force units

    Combinations between Energy and Distance units

    "},{"location":"API/units.html#openqdc.utils.units.Conversion","title":"Conversion","text":"

    Conversion from one unit system to another defined by a name and a callable

    Source code in openqdc/utils/units.py
    class Conversion:\n    \"\"\"\n    Conversion from one unit system to another defined by a name and a callable\n    \"\"\"\n\n    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n        \"\"\"\n\n        Parameters:\n            in_unit: String defining the units of the current values\n            out_unit: String defining the target units\n            func: The callable to compute the conversion\n        \"\"\"\n        name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n        if name in CONVERSION_REGISTRY:\n            raise ConversionAlreadyDefined(in_unit, out_unit)\n        CONVERSION_REGISTRY[name] = self\n\n        self.name = name\n        self.fn = func\n\n    def __call__(self, x):\n        return self.fn(x)\n
    "},{"location":"API/units.html#openqdc.utils.units.Conversion.__init__","title":"__init__(in_unit, out_unit, func)","text":"

    Parameters:

    Name Type Description Default in_unit str

    String defining the units of the current values

    required out_unit str

    String defining the target units

    required func Callable[[float], float]

    The callable to compute the conversion

    required Source code in openqdc/utils/units.py
    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n    \"\"\"\n\n    Parameters:\n        in_unit: String defining the units of the current values\n        out_unit: String defining the target units\n        func: The callable to compute the conversion\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n    if name in CONVERSION_REGISTRY:\n        raise ConversionAlreadyDefined(in_unit, out_unit)\n    CONVERSION_REGISTRY[name] = self\n\n    self.name = name\n    self.fn = func\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion","title":"DistanceTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible distance units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass DistanceTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible distance units for conversion\n    \"\"\"\n\n    ANG = \"ang\"\n    NM = \"nm\"\n    BOHR = \"bohr\"\n\n    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the distance to the desired units.\n\n        Parameters:\n            distance: distance unit to convert to\n            fraction: whether it is distance^1 or distance^-1\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion.to","title":"to(distance, fraction=False)","text":"

    Get the conversion function to convert the distance to the desired units.

    Parameters:

    Name Type Description Default distance DistanceTypeConversion

    distance unit to convert to

    required fraction bool

    whether it is distance^1 or distance^-1

    False

    Returns:

    Type Description Callable[[float], float]

    callable to convert the distance to the desired units

    Source code in openqdc/utils/units.py
    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the distance to the desired units.\n\n    Parameters:\n        distance: distance unit to convert to\n        fraction: whether it is distance^1 or distance^-1\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion","title":"EnergyTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible energy units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass EnergyTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible energy units for conversion\n    \"\"\"\n\n    KCAL_MOL = \"kcal/mol\"\n    KJ_MOL = \"kj/mol\"\n    HARTREE = \"hartree\"\n    EV = \"ev\"\n    MEV = \"mev\"\n    RYD = \"ryd\"\n\n    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the energy to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n\n        Returns:\n            Callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion.to","title":"to(energy)","text":"

    Get the conversion function to convert the energy to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    Callable to convert the energy to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the energy to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n\n    Returns:\n        Callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion","title":"ForceTypeConversion","text":"

    Bases: ConversionEnum

    Define the possible force units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass ForceTypeConversion(ConversionEnum):\n    \"\"\"\n    Define the possible foce units for conversion\n    \"\"\"\n\n    #     Name      = EnergyTypeConversion,         , DistanceTypeConversion\n    HARTREE_BOHR = EnergyTypeConversion.HARTREE, DistanceTypeConversion.BOHR\n    HARTREE_ANG = EnergyTypeConversion.HARTREE, DistanceTypeConversion.ANG\n    HARTREE_NM = EnergyTypeConversion.HARTREE, DistanceTypeConversion.NM\n    EV_BOHR = EnergyTypeConversion.EV, DistanceTypeConversion.BOHR\n    EV_ANG = EnergyTypeConversion.EV, DistanceTypeConversion.ANG\n    EV_NM = EnergyTypeConversion.EV, DistanceTypeConversion.NM\n    KCAL_MOL_BOHR = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.BOHR\n    KCAL_MOL_ANG = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.ANG\n    KCAL_MOL_NM = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.NM\n    KJ_MOL_BOHR = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.BOHR\n    KJ_MOL_ANG = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.ANG\n    KJ_MOL_NM = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.NM\n    MEV_BOHR = EnergyTypeConversion.MEV, DistanceTypeConversion.BOHR\n    MEV_ANG = EnergyTypeConversion.MEV, DistanceTypeConversion.ANG\n    MEV_NM = EnergyTypeConversion.MEV, DistanceTypeConversion.NM\n    RYD_BOHR = EnergyTypeConversion.RYD, DistanceTypeConversion.BOHR\n    RYD_ANG = EnergyTypeConversion.RYD, DistanceTypeConversion.ANG\n    RYD_NM = EnergyTypeConversion.RYD, DistanceTypeConversion.NM\n\n    def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):\n        self.energy = energy\n        self.distance = distance\n\n    def __str__(self):\n        return f\"{self.energy}/{self.distance}\"\n\n    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the force to the desired units.\n\n        Parameters:\n            energy: energy unit 
to convert to\n            distance: distance unit to convert to\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion.to","title":"to(energy, distance)","text":"

    Get the conversion function to convert the force to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required distance DistanceTypeConversion

    distance unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    callable to convert the force to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the force to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n        distance: distance unit to convert to\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html","title":"Utils","text":""},{"location":"API/utils.html#openqdc.utils.check_file","title":"check_file(path)","text":"

    Checks whether the given path exists on the local filesystem

    Source code in openqdc/utils/io.py
    def check_file(path) -> bool:\n    \"\"\"Checks if file present on local\"\"\"\n    return os.path.exists(path)\n
    "},{"location":"API/utils.html#openqdc.utils.create_hdf5_file","title":"create_hdf5_file(hdf5_file_path)","text":"

    Creates hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def create_hdf5_file(hdf5_file_path: str):\n    \"\"\"Creates hdf5 file with fsspec\"\"\"\n    fp = fsspec.open(hdf5_file_path, \"wb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    return h5py.File(fp, \"a\")\n
    "},{"location":"API/utils.html#openqdc.utils.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html#openqdc.utils.get_local_cache","title":"get_local_cache()","text":"

    Returns the local cache directory. It creates it if it does not exist.

    Returns:

    Name Type Description str str

    path to the local cache directory

    Source code in openqdc/utils/io.py
    def get_local_cache() -> str:\n    \"\"\"\n    Returns the local cache directory. It creates it if it does not exist.\n\n    Returns:\n        str: path to the local cache directory\n    \"\"\"\n    cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR))\n    os.makedirs(cache_dir, exist_ok=True)\n    return cache_dir\n
    "},{"location":"API/utils.html#openqdc.utils.get_remote_cache","title":"get_remote_cache(write_access=False)","text":"

    Returns the entry point based on the write access.

    Source code in openqdc/utils/io.py
    def get_remote_cache(write_access=False) -> str:\n    \"\"\"\n    Returns the entry point based on the write access.\n    \"\"\"\n    if write_access:\n        remote_cache = \"openqdc/v1\"  # \"gs://qmdata-public/openqdc\"\n        # remote_cache = \"gs://qmdata-public/openqdc\"\n    else:\n        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get(\"OPENQDC_DOWNLOAD_API\", \"s3\"))\n        # remote_cache = \"https://storage.googleapis.com/qmdata-public/openqdc\"\n    return remote_cache\n
    "},{"location":"API/utils.html#openqdc.utils.load_hdf5_file","title":"load_hdf5_file(hdf5_file_path)","text":"

    Loads hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def load_hdf5_file(hdf5_file_path: str):\n    \"\"\"Loads hdf5 file with fsspec\"\"\"\n    if not check_file(hdf5_file_path):\n        raise FileNotFoundError(f\"File {hdf5_file_path} does not exist on GCS and local.\")\n\n    fp = fsspec.open(hdf5_file_path, \"rb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    file = h5py.File(fp)\n\n    # inorder to enable multiprocessing:\n    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801\n    # fsspec.asyn.iothread[0] = None\n    # fsspec.asyn.loop[0] = None\n\n    return file\n
    "},{"location":"API/utils.html#openqdc.utils.load_json","title":"load_json(path)","text":"

    Loads json file

    Source code in openqdc/utils/io.py
    def load_json(path):\n    \"\"\"Loads json file\"\"\"\n    with fsspec.open(path, \"r\") as fp:  # Unpickling\n        return json.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.load_pkl","title":"load_pkl(path, check=True)","text":"

    Load pkl file

    Source code in openqdc/utils/io.py
    def load_pkl(path, check=True):\n    \"\"\"Load pkl file\"\"\"\n    if check:\n        if not check_file(path):\n            raise FileNotFoundError(f\"File {path} does not exist on GCS and local.\")\n\n    with open(path, \"rb\") as fp:  # Unpickling\n        return pkl.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.makedirs","title":"makedirs(path, exist_ok=True)","text":"

    Creates directory

    Source code in openqdc/utils/io.py
    def makedirs(path, exist_ok=True):\n    \"\"\"Creates directory\"\"\"\n    os.makedirs(path, exist_ok=exist_ok)\n
    "},{"location":"API/utils.html#openqdc.utils.read_qc_archive_h5","title":"read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)","text":"

    Extracts data from the HDF5 archive file.

    Source code in openqdc/utils/io.py
    def read_qc_archive_h5(\n    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None\n) -> List[Dict[str, np.ndarray]]:\n    \"\"\"Extracts data from the HDF5 archive file.\"\"\"\n    data = load_hdf5_file(raw_path)\n    data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}\n\n    n = len(data_t[\"molecule_id\"])\n    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]\n    return samples\n
    "},{"location":"API/utils.html#openqdc.utils.save_pkl","title":"save_pkl(file, path)","text":"

    Saves pkl file

    Source code in openqdc/utils/io.py
    def save_pkl(file, path):\n    \"\"\"Saves pkl file\"\"\"\n    logger.info(f\"Saving file at {path}\")\n    with fsspec.open(path, \"wb\") as fp:  # Pickling\n        pkl.dump(file, fp)\n
    "},{"location":"API/utils.html#openqdc.utils.set_cache_dir","title":"set_cache_dir(d)","text":"

    Optionally set the _OPENQDC_CACHE_DIR directory.

    Parameters:

    Name Type Description Default d str

    path to a local folder.

    required Source code in openqdc/utils/io.py
    def set_cache_dir(d):\n    r\"\"\"\n    Optionally set the _OPENQDC_CACHE_DIR directory.\n\n    Args:\n        d (str): path to a local folder.\n    \"\"\"\n    if d is None:\n        return\n    global _OPENQDC_CACHE_DIR\n    _OPENQDC_CACHE_DIR = os.path.normpath(os.path.expanduser(d))\n
    "},{"location":"API/datasets/alchemy.html","title":"Alchemy","text":""},{"location":"API/datasets/alchemy.html#openqdc.datasets.potential.alchemy.Alchemy","title":"Alchemy","text":"

    Bases: BaseDataset

    Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange matrix.

    Usage:

    from openqdc.datasets import Alchemy\ndataset = Alchemy()\n

    Reference

    https://arxiv.org/abs/1906.09427 https://alchemy.tencent.com/

    Source code in openqdc/datasets/potential/alchemy.py
    class Alchemy(BaseDataset):\n    \"\"\"\n    Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.\n    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level\n    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used\n    to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G\n    is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the\n    B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. The\n    auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange\n    matrix.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Alchemy\n    dataset = Alchemy()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/1906.09427\n        https://alchemy.tencent.com/\n    \"\"\"\n\n    __name__ = \"alchemy\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\"alchemy.zip\": \"https://alchemy.tencent.com/data/alchemy-v20191129.zip\"}\n\n    def read_raw_entries(self):\n        dir_path = p_join(self.root, \"Alchemy-v20191129\")\n        full_csv = pd.read_csv(p_join(dir_path, \"final_version.csv\"))\n        energies = full_csv[\"U0\\n(Ha, internal energy at 0 K)\"].tolist()\n        atom_folder = full_csv[\"atom number\"]\n        gdb_idx = full_csv[\"gdb_idx\"]\n        idxs = full_csv.index.tolist()\n        samples = []\n        for i in tqdm(idxs):\n            sdf_file = p_join(dir_path, f\"atom_{atom_folder[i]}\", 
f\"{gdb_idx[i]}.sdf\")\n            energy = energies[i]\n            samples.append(read_mol(sdf_file, energy))\n        return samples\n
    "},{"location":"API/datasets/ani.html","title":"ANI","text":""},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1","title":"ANI1","text":"

    Bases: BaseDataset

    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT level.

    Usage:

    from openqdc.datasets import ANI1\ndataset = ANI1()\n

    References

    https://www.nature.com/articles/sdata2017193

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1(BaseDataset):\n    \"\"\"\n    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic\n    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the\n    wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules\n    are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary\n    point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT\n    level.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1\n    dataset = ANI1()\n    ```\n\n    References:\n        https://www.nature.com/articles/sdata2017193\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"ani1.hdf5.gz\": \"https://zenodo.org/record/3585840/files/214.hdf5.gz\"}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"ani\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"ani\", links=self.__links__)\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, 
self.force_target_names)\n        return samples\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX","title":"ANI1CCX","text":"

    Bases: ANI1

    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.

    Usage:

    from openqdc.datasets import ANI1CCX\ndataset = ANI1CCX()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX(ANI1):\n    \"\"\"\n    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active\n    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX\n    dataset = ANI1CCX()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_T_CBS,  # \"ccsd(t)/cbs\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVTZ,  # \"ccsd(t)/cc-pvtz\",\n        PotentialMethod.TCSSD_T_CC_PVDZ,  # \"tccsd(t)/cc-pvdz\",\n    ]\n\n    energy_target_names = [\n        \"CCSD(T)*:CBS Total Energy\",\n        \"NPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n        \"NPNO-CCSD(T):cc-pVTZ Correlation Energy\",\n        \"TPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n    ]\n    force_target_names = []\n    __links__ = {\"ani1x.hdf5.gz\": \"https://zenodo.org/record/4081694/files/292.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/ani.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX_V2","title":"ANI1CCX_V2","text":"

    Bases: ANI1CCX

    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each conformation.

    Usage:

    from openqdc.datasets import ANI1CCX_V2\ndataset = ANI1CCX_V2()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX_V2(ANI1CCX):\n    \"\"\"\n    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels\n    for each conformation.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX_V2\n    dataset = ANI1CCX_V2()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx_v2\"\n\n    __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]\n    energy_target_names = ANI1CCX.energy_target_names + [\"PM6\", \"GFN2\"]\n    __force_mask__ = ANI1CCX.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1X","title":"ANI1X","text":"

    Bases: ANI1

    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the following techniques is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.

    Usage:

    from openqdc.datasets import ANI1X\ndataset = ANI1X()\n

    References

    https://doi.org/10.1063/1.5023802

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1X(ANI1):\n    \"\"\"\n    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to\n    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,\n    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the techniques\n    are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and\n    (4) torsion sampling.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1X\n    dataset = ANI1X()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5023802\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.HF_CC_PVDZ,\n        PotentialMethod.HF_CC_PVQZ,\n        PotentialMethod.HF_CC_PVTZ,\n        PotentialMethod.MP2_CC_PVDZ,\n        PotentialMethod.MP2_CC_PVQZ,\n        PotentialMethod.MP2_CC_PVTZ,\n        PotentialMethod.WB97X_6_31G_D,\n        PotentialMethod.WB97X_CC_PVTZ,\n    ]\n\n    energy_target_names = [\n        \"HF:cc-pVDZ Total Energy\",\n        \"HF:cc-pVQZ Total Energy\",\n        \"HF:cc-pVTZ Total Energy\",\n        \"MP2:cc-pVDZ Correlation Energy\",\n        \"MP2:cc-pVQZ Correlation Energy\",\n        \"MP2:cc-pVTZ Correlation Energy\",\n        \"wB97x:6-31G(d) Total Energy\",\n        \"wB97x:def2-TZVPP Total Energy\",\n    ]\n\n    force_target_names = [\n        \"wB97x:6-31G(d) Atomic Forces\",\n        \"wB97x:def2-TZVPP Atomic Forces\",\n    ]\n\n    __force_mask__ = [False, False, False, False, False, False, True, True]\n    __links__ = {\"ani1ccx.hdf5.gz\": \"https://zenodo.org/record/4081692/files/293.hdf5.gz\"}\n\n    def convert_forces(self, x):\n        return super().convert_forces(x) * 0.529177249  # correct the 
Dataset error\n\n    def __smiles_converter__(self, x):\n        return x\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI2X","title":"ANI2X","text":"

    Bases: ANI1

    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, ChEMBL, and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are used for generating geometries.

    Usage:

    from openqdc.datasets import ANI2X\ndataset = ANI2X()\n

    References

    https://doi.org/10.1021/acs.jctc.0c00121 https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI2X(ANI1):\n    \"\"\"\n    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.\n    It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized\n    using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are\n    used for generating geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI2X\n    dataset = ANI2X()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.0c00121\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani2x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        # PotentialMethod.NONE,  # \"b973c/def2mtzvp\",\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/631gd\", # PAPER DATASET\n        # PotentialMethod.NONE,  # \"wb97md3bj/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97mv/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97x/def2tzvpp\",\n    ]\n\n    energy_target_names = [\n        # \"b973c/def2mtzvp\",\n        \"wb97x/631gd\",\n        # \"wb97md3bj/def2tzvpp\",\n        # \"wb97mv/def2tzvpp\",\n        # \"wb97x/def2tzvpp\",\n    ]\n\n    force_target_names = [\"wb97x/631gd\"]  # \"b973c/def2mtzvp\",\n\n    __force_mask__ = [True]\n    __links__ = {  # \"ANI-2x-B973c-def2mTZVP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1\", # noqa\n        # \"ANI-2x-wB97MV-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1\", # noqa\n        \"ANI-2x-wB97X-631Gd.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1\",  
# noqa\n        # \"ANI-2x-wB97X-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1\", # noqa\n    }\n\n    def __smiles_converter__(self, x):\n        return x\n\n    def read_raw_entries(self):\n        samples = []\n        for lvl_theory in self.__links__.keys():\n            raw_path = p_join(self.root, \"final_h5\", f\"{lvl_theory.split('.')[0]}.h5\")\n            samples.extend(read_ani2_h5(raw_path))\n        return samples\n
    "},{"location":"API/datasets/comp6.html","title":"Comp6","text":""},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6","title":"COMP6","text":"

    Bases: BaseDataset

    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and molecular dipoles.

    Details of the benchmark sets are as follows

    S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and

    mixed influence interactions.

    ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n

    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point calculations are performed to calculate energies and forces.

    GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n

    The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence criteria. Normal modes/force constants are computed using the reference DFT model. Finally, Diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.

    GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n

    and 1000 molecules per 12 and 13 heavy atoms are subsampled from GDB-13. Non-equilibrium conformations are generated via DNMS.

    Tripeptide: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n\nDrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n

    Structures are optimized similar to GDB7to9.

    Usage:

    from openqdc.datasets import COMP6\ndataset = COMP6()\n

    References

    https://aip.scitation.org/doi/abs/10.1063/1.5023802

    https://github.com/isayev/COMP6

    S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d

    GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/

    GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/

    DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h

    Source code in openqdc/datasets/potential/comp6.py
    class COMP6(BaseDataset):\n    \"\"\"\n    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the\n    ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and\n    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using\n    the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfield charges and\n    molecular dipoles.\n\n    Details of the benchmark sets are as follows:\n        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and\n    mixed influence interactions.\\n\n        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small\n    proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point\n    calculations are performed to calculate energies and forces.\\n\n        GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n    The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence\n    criteria. Normal modes/force constants are computer using the reference DFT model. Finally, Diverse normal\n    mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\\n\n        GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are\n    generated via DNMS.\\n\n        Tripeptide: Consists of 248 random tripeptides. 
Structures are optimized similar to GDB7to9.\\n\n        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n    Structures are optimized similar to GDB7to9.\n\n    Usage:\n    ```python\n    from openqdc.datasets import COMP6\n    dataset = COMP6()\n    ```\n\n    References:\n        https://aip.scitation.org/doi/abs/10.1063/1.5023802\\n\n        https://github.com/isayev/COMP6\\n\n        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\\n\n        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\\n\n        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\\n\n        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h\n    \"\"\"\n\n    __name__ = \"comp6\"\n\n    # watchout that forces are stored as -grad(E)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"  # angstorm\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g*\",\n        PotentialMethod.B3LYP_D3_BJ_DEF2_TZVP,  # \"b3lyp-d3(bj)/def2-tzvp\",\n        PotentialMethod.B3LYP_DEF2_TZVP,  # \"b3lyp/def2-tzvp\",\n        PotentialMethod.HF_DEF2_TZVP,  # \"hf/def2-tzvp\",\n        PotentialMethod.PBE_D3_BJ_DEF2_TZVP,  # \"pbe-d3(bj)/def2-tzvp\",\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n        PotentialMethod.SVWN_DEF2_TZVP,  # \"svwn/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"Energy\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP:def2-tzvp\",\n        \"HF:def2-tzvp\",\n        \"PBE-D3M(BJ):def2-tzvp\",\n        \"PBE:def2-tzvp\",\n        \"SVWN:def2-tzvp\",\n    ]\n    __force_mask__ = [True, False, False, False, False, False, False]\n\n    force_target_names = [\n        \"Gradient\",\n    ]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n      
  \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        samples = []\n        for subset in [\"ani_md\", \"drugbank\", \"gdb7_9\", \"gdb10_13\", \"s66x8\", \"tripeptides\"]:\n            raw_path = p_join(self.root, f\"{subset}.h5.gz\")\n            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/comp6.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/des.html","title":"DES","text":""},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES370K","title":"DES370K","text":"

    Bases: BaseInteractionDataset, IDES

    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions) including water and functional groups found in proteins. Dimer geometries are generated using QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.

    Usage:

    from openqdc.datasets import DES370K\ndataset = DES370K()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES370K(BaseInteractionDataset, IDES):\n    \"\"\"\n    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies\n    computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules\n    and ions) including water and functional groups found in proteins. Dimer geometries are generated using\n    QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES370K\n    dataset = DES370K()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des370k_interaction\"\n    __filename__ = \"DES370K.csv\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVDZ,\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_CC_PVDZ,\n        InteractionMethod.CCSD_T_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        
InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"cc_MP2_all\",\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"cc_CCSD(T)_all\",\n        \"cbs_CCSD(T)_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES370K.zip\": \"https://zenodo.org/record/5676266/files/DES370K.zip\",\n    }\n\n    @property\n    def csv_path(self):\n        return os.path.join(self.root, self.__filename__)\n\n    def _create_subsets(self, **kwargs):\n        return create_subset(kwargs[\"smiles0\"], kwargs[\"smiles1\"])\n\n    def read_raw_entries(self) -> List[Dict]:\n        filepath = self.csv_path\n        logger.info(f\"Reading {self.__name__} interaction data from {filepath}\")\n        df = pd.read_csv(filepath)\n        data = []\n        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):\n            item = parse_des_df(row, self.energy_target_names)\n            item[\"subset\"] = self._create_subsets(row=row, **item)\n            item = convert_to_record(item)\n            data.append(item)\n        return data\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES5M","title":"DES5M","text":"

    Bases: DES370K

    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using QM based optimization and MD simulations.

    Usage:

    from openqdc.datasets import DES5M\ndataset = DES5M()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES5M(DES370K):\n    \"\"\"\n    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies\n    computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using\n    QM based optimization and MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES5M\n    dataset = DES5M()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des5m_interaction\"\n    __filename__ = \"DES5M.csv\"\n\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n 
       \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES5M.zip\": \"https://zenodo.org/records/5706002/files/DESS5M.zip?download=1\",\n    }\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66","title":"DESS66","text":"

    Bases: DES370K

    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66\ndataset = DESS66()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    S66: https://pubs.acs.org/doi/10.1021/ct2002946

    Source code in openqdc/datasets/interaction/des.py
    class DESS66(DES370K):\n    \"\"\"\n    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total.\n    The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66\n    dataset = DESS66()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\\n\n        S66: https://pubs.acs.org/doi/10.1021/ct2002946\n    \"\"\"\n\n    __name__ = \"des_s66\"\n    __filename__ = \"DESS66.csv\"\n    __links__ = {\"DESS66.zip\": \"https://zenodo.org/records/5676284/files/DESS66.zip?download=1\"}\n\n    def _create_subsets(self, **kwargs):\n        return kwargs[\"row\"][\"system_name\"]\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66x8","title":"DESS66x8","text":"

    Bases: DESS66

    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66x8\ndataset = DESS66x8()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DESS66x8(DESS66):\n    \"\"\"\n    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve\n    giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66x8\n    dataset = DESS66x8()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des_s66x8\"\n    __filename__ = \"DESS66x8.csv\"\n    __links__ = {\"DESS66x8.zip\": \"https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1\"}\n
    "},{"location":"API/datasets/gdml.html","title":"GDML","text":""},{"location":"API/datasets/gdml.html#openqdc.datasets.potential.gdml.GDML","title":"GDML","text":"

    Bases: BaseDataset

    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of Benzene (627000 conformations), Uracil (133000 conformations), Naphthalene (326000 conformations), Aspirin (211000 conformations), Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for each conformation are computed using the PBE + vdW-TS electronic structure method.

    The dataset consists of the following trajectories

    Benzene: 627000 samples

    Uracil: 133000 samples

    Naphthalene: 326000 samples

    Aspirin: 211000 samples

    Salicylic Acid: 320000 samples

    Malonaldehyde: 993000 samples

    Ethanol: 555000 samples

    Toluene: 100000 samples

    Usage:

    from openqdc.datasets import GDML\ndataset = GDML()\n

    References

    https://www.science.org/doi/10.1126/sciadv.1603015 http://www.sgdml.org/#datasets

    Source code in openqdc/datasets/potential/gdml.py
    class GDML(BaseDataset):\n    \"\"\"\n    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio\n    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene\n    (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin\n    (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations),\n    Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for\n    each conformation are computed using the PBE + vdW-TS electronic structure method.\n    molecular dynamics (AIMD) trajectories.\n\n    The dataset consists of the following trajectories:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import GDML\n    dataset = GDML()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.1603015\n        http://www.sgdml.org/#datasets\n    \"\"\"\n\n    __name__ = \"gdml\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_CC_PVDZ,  # \"ccsd/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        # TODO: verify if basis set vdw-ts == def2-tzvp and\n        # it is the same in ISO17 and revmd17\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",  # MD17\n    ]\n\n    energy_target_names = [\n        \"CCSD Energy\",\n        \"CCSD(T) Energy\",\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True, True, True]\n\n    force_target_names = [\n        \"CCSD Gradient\",\n        \"CCSD(T) Gradient\",\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n 
   __forces_unit__ = \"kcal/mol/ang\"\n    __links__ = {\n        \"gdb7_9.hdf5.gz\": \"https://zenodo.org/record/3588361/files/208.hdf5.gz\",\n        \"gdb10_13.hdf5.gz\": \"https://zenodo.org/record/3588364/files/209.hdf5.gz\",\n        \"drugbank.hdf5.gz\": \"https://zenodo.org/record/3588361/files/207.hdf5.gz\",\n        \"tripeptides.hdf5.gz\": \"https://zenodo.org/record/3588368/files/211.hdf5.gz\",\n        \"ani_md.hdf5.gz\": \"https://zenodo.org/record/3588341/files/205.hdf5.gz\",\n        \"s66x8.hdf5.gz\": \"https://zenodo.org/record/3588367/files/210.hdf5.gz\",\n    }\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"gdml.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"gdml\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/geom.html","title":"GEOM","text":"

    Bases: BaseDataset

    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.

    Usage:

    from openqdc.datasets import GEOM\ndataset = GEOM()\n

    References

    https://www.nature.com/articles/s41597-022-01288-4

    https://github.com/learningmatter-mit/geom

    CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d

    Source code in openqdc/datasets/potential/geom.py
    class GEOM(BaseDataset):\n    \"\"\"\n    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules\n    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.\n    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and\n    the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the\n    conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.\n\n    Usage:\n    ```python\n    from openqdc.datasets import GEOM\n    dataset = GEOM()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-022-01288-4\\n\n        https://github.com/learningmatter-mit/geom\\n\n        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d\n    \"\"\"\n\n    __name__ = \"geom\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    energy_target_names = [\"gfn2_xtb.energy\"]\n    force_target_names = []\n\n    partitions = [\"qm9\", \"drugs\"]\n    __links__ = {\"rdkit_folder.tar.gz\": \"https://dataverse.harvard.edu/api/access/datafile/4327252\"}\n\n    def _read_raw_(self, partition):\n        raw_path = p_join(self.root, \"rdkit_folder\")\n\n        mols = load_json(p_join(raw_path, f\"summary_{partition}.json\"))\n        mols = list(mols.items())\n\n        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731\n        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job\n        return samples\n\n    def read_raw_entries(self):\n        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])\n        return samples\n
    "},{"location":"API/datasets/iso_17.html","title":"ISO_17","text":""},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17","title":"ISO17","text":"

    Bases: BaseDataset

    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.

    Usage:

    from openqdc.datasets import ISO17\ndataset = ISO17()\n

    References

    https://arxiv.org/abs/1706.08566

    https://arxiv.org/abs/1609.08259

    https://www.nature.com/articles/sdata201422

    https://pubmed.ncbi.nlm.nih.gov/10062328/

    https://pubmed.ncbi.nlm.nih.gov/19257665/

    Source code in openqdc/datasets/potential/iso_17.py
    class ISO17(BaseDataset):\n    \"\"\"\n    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of\n    atoms (C7O2H10) arranged in different chemically valid structures. It consist of 129 molecules, each containing\n    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics\n    trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient\n    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der\n    Waals correction method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ISO17\n    dataset = ISO17()\n    ```\n\n    References:\n        https://arxiv.org/abs/1706.08566\\n\n        https://arxiv.org/abs/1609.08259\\n\n        https://www.nature.com/articles/sdata201422\\n\n        https://pubmed.ncbi.nlm.nih.gov/10062328/\\n\n        https://pubmed.ncbi.nlm.nih.gov/19257665/\n    \"\"\"\n\n    __name__ = \"iso_17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"iso_17.hdf5.gz\": \"https://zenodo.org/record/3585907/files/216.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"iso_17.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"iso_17\", self.energy_target_names, self.force_target_names)\n\n     
   return samples\n
    "},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/iso_17.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/l7.html","title":"L7","text":""},{"location":"API/datasets/l7.html#openqdc.datasets.interaction.l7.L7","title":"L7","text":"

    Bases: YamlDataset

    The L7 interaction energy dataset consists of 7 dispersion-stabilized non-covalent complexes with energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are taken from crystal X-ray data and optimized with a DFT method specific to the complex.

    Usage:

    from openqdc.datasets import L7\ndataset = L7()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct400036b

    Source code in openqdc/datasets/interaction/l7.py
    class L7(YamlDataset):\n    \"\"\"\n    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with\n    energies labelled using semi-empirical and quantum mechanical methods. The intial geometries are\n    taken from crystal X-ray data and optimized with a DFT method specific to the complex.\n\n    Usage:\n    ```python\n    from openqdc.datasets import L7\n    dataset = L7()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct400036b\n    \"\"\"\n\n    __name__ = \"l7\"\n    __energy_methods__ = [\n        InteractionMethod.QCISDT_CBS,  # \"QCISD(T)/CBS\",\n        InteractionMethod.DLPNO_CCSDT,  # \"DLPNO-CCSD(T)\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.MP2C_CBS,  # \"MP2C/CBS\",\n        InteractionMethod.FIXED,  # \"fixed\", TODO: we should remove this level of theory because unless we have a pro\n        InteractionMethod.DLPNO_CCSDT0,  # \"DLPNO-CCSD(T0)\",\n        InteractionMethod.LNO_CCSDT,  # \"LNO-CCSD(T)\",\n        InteractionMethod.FN_DMC,  # \"FN-DMC\",\n    ]\n    __links__ = {\n        \"l7.yaml\": \"http://cuby4.molecular.cz/download_datasets/l7.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/L7.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.geometry.split(\":\")[1]\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        return np.array([int(item.setup[\"molecule_a\"][\"selection\"].split(\"-\")[1])], dtype=np.int32)\n
    "},{"location":"API/datasets/md22.html","title":"MD22","text":""},{"location":"API/datasets/md22.html#openqdc.datasets.potential.md22.MD22","title":"MD22","text":"

    Bases: RevMD17

    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD level of theory.

    Usage:

    from openqdc.datasets import MD22\ndataset = MD22()\n

    Reference

    https://arxiv.org/abs/2209.14865

    Source code in openqdc/datasets/potential/md22.py
    class MD22(RevMD17):\n    \"\"\"\n    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,\n    ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories\n    are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD\n    level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MD22\n    dataset = MD22()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2209.14865\n    \"\"\"\n\n    __name__ = \"md22\"\n    __links__ = {\n        f\"{x}.npz\": f\"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz\"\n        for x in [\n            \"Ac-Ala3-NHMe\",\n            \"DHA\",\n            \"stachyose\",\n            \"AT-AT\",\n            \"AT-AT-CG-CG\",\n            \"double-walled_nanotube\",\n            \"buckyball-catcher\",\n        ]\n    }\n\n    def read_raw_entries(self):\n        entries_list = []\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/metcalf.html","title":"Metcalf","text":""},{"location":"API/datasets/metcalf.html#openqdc.datasets.interaction.metcalf.Metcalf","title":"Metcalf","text":"

    Bases: BaseInteractionDataset

    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various components.

    Usage:

    from openqdc.datasets import Metcalf\ndataset = Metcalf()\n

    Reference

    https://doi.org/10.1063/1.5142636

    Source code in openqdc/datasets/interaction/metcalf.py
    class Metcalf(BaseInteractionDataset):\n    \"\"\"\n    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to\n    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and\n    the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various\n    components.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Metcalf\n    dataset = Metcalf()\n    ```\n\n    Reference:\n        https://doi.org/10.1063/1.5142636\n    \"\"\"\n\n    __name__ = \"metcalf\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n    ]\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = [\n        \"total energy\",\n        \"electrostatic energy\",\n        \"exchange energy\",\n        \"induction energy\",\n        \"dispersion energy\",\n    ]\n    __links__ = {\"model-data.tar.gz\": \"https://zenodo.org/records/10934211/files/model-data.tar?download=1\"}\n\n    def read_raw_entries(self) -> List[Dict]:\n        # extract in folders\n        extract_raw_tar_gz(self.root)\n        data = []\n        for filename in glob(self.root + f\"{os.sep}*.xyz\"):\n            data.extend(read_xyz(filename, self.__name__))\n        return data\n
    "},{"location":"API/datasets/molecule3d.html","title":"Molecule3D","text":""},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.Molecule3D","title":"Molecule3D","text":"

    Bases: BaseDataset

    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, or with damaged log files.

    Usage:

    from openqdc.datasets import Molecule3D\ndataset = Molecule3D()\n

    References

    https://arxiv.org/abs/2110.01717

    https://github.com/divelab/MoleculeX

    Source code in openqdc/datasets/potential/molecule3d.py
    class Molecule3D(BaseDataset):\n    \"\"\"\n    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the\n    B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing\n    molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems,\n    or with damaged log files.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Molecule3D\n    dataset = Molecule3D()\n    ```\n\n    References:\n        https://arxiv.org/abs/2110.01717\\n\n        https://github.com/divelab/MoleculeX\n    \"\"\"\n\n    __name__ = \"molecule3d\"\n    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # \"b3lyp/6-31g*\",\n    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY\n    __energy_unit__ = \"ev\"  # CALCULATED\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"molecule3d.zip\": \"https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy\"}\n\n    energy_target_names = [\"b3lyp/6-31g*.energy\"]\n\n    def read_raw_entries(self):\n        raw = p_join(self.root, \"data\", \"raw\")\n        sdf_paths = glob(p_join(raw, \"*.sdf\"))\n        properties_path = p_join(raw, \"properties.csv\")\n\n        fn = lambda x: _read_sdf(x, properties_path)\n        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol","title":"read_mol(mol, energy)","text":"

    Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--parameters","title":"Parameters","text":"

    mol: Chem.rdchem.Mol RDKit molecule energy: float Energy of the molecule

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--returns","title":"Returns","text":"

    res: dict Dictionary containing the following keys: - name: np.ndarray of shape (N,) containing the smiles of the molecule - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions - energies: np.ndarray of shape (1,) containing the energy of the conformer - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer - subset: np.ndarray of shape (1) containing \"molecule3d\"

    Source code in openqdc/datasets/potential/molecule3d.py
    def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:\n    \"\"\"Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies\n\n    Parameters\n    ----------\n    mol: Chem.rdchem.Mol\n        RDKit molecule\n    energy: float\n        Energy of the molecule\n\n    Returns\n    -------\n    res: dict\n        Dictionary containing the following keys:\n        - name: np.ndarray of shape (N,) containing the smiles of the molecule\n        - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions\n        - energies: np.ndarray of shape (1,) containing the energy of the conformer\n        - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer\n        - subset: np.ndarray of shape (1) containing \"molecule3d\"\n    \"\"\"\n    smiles = dm.to_smiles(mol, explicit_hs=False)\n    # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)\n    x = get_atomic_number_and_charge(mol)\n    positions = mol.GetConformer().GetPositions()\n\n    res = dict(\n        name=np.array([smiles]),\n        subset=np.array([\"molecule3d\"]),\n        energies=np.array([energy]).astype(np.float64)[:, None],\n        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),\n        n_atoms=np.array([x.shape[0]], dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/multixcqm9.html","title":"MultixcQM9","text":""},{"location":"API/datasets/multixcqm9.html#openqdc.datasets.potential.multixcqm9.MultixcQM9","title":"MultixcQM9","text":"

    Bases: BaseDataset

    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting of 133K QM9 molecule geometries calculated with 76 different DFT functionals and three different basis sets, resulting in 228 energy values for each molecule along with the semi-empirical method GFN2-xTB. Geometries for the molecules are used directly from Kim et al., which uses the G4MP2 method.

    Usage:

    from openqdc.datasets import MultixcQM9\ndataset = MultixcQM9()\n

    References

    https://www.nature.com/articles/s41597-023-02690-2

    https://github.com/chemsurajit/largeDFTdata

    https://www.nature.com/articles/s41597-019-0121-7

    Source code in openqdc/datasets/potential/multixcqm9.py
    class MultixcQM9(BaseDataset):\n    \"\"\"\n    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting\n    of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets\n    resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the\n    molecules are used directly from Kim et al. which uses G4MP2 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MultixcQM9\n    dataset = MultixcQM9()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-023-02690-2\\n\n        https://github.com/chemsurajit/largeDFTdata\\n\n        https://www.nature.com/articles/s41597-019-0121-7\\n\n    \"\"\"\n\n    __name__ = \"multixcqm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.KCIS_MODIFIED_DZP,\n        PotentialMethod.KCIS_ORIGINAL_DZP,\n        PotentialMethod.PKZB_DZP,\n        PotentialMethod.VS98_DZP,\n        PotentialMethod.LDA_VWN_DZP,\n        PotentialMethod.PW91_DZP,\n        PotentialMethod.BLYP_DZP,\n        PotentialMethod.BP_DZP,\n        PotentialMethod.PBE_DZP,\n        PotentialMethod.RPBE_DZP,\n        PotentialMethod.REVPBE_DZP,\n        PotentialMethod.OLYP_DZP,\n        PotentialMethod.FT97_DZP,\n        PotentialMethod.BLAP3_DZP,\n        PotentialMethod.HCTH_93_DZP,\n        PotentialMethod.HCTH_120_DZP,\n        PotentialMethod.HCTH_147_DZP,\n        PotentialMethod.HCTH_407_DZP,\n        PotentialMethod.BMTAU1_DZP,\n        PotentialMethod.BOP_DZP,\n        PotentialMethod.PKZBX_KCISCOR_DZP,\n        PotentialMethod.VS98_X_XC_DZP,\n        PotentialMethod.VS98_X_ONLY_DZP,\n        PotentialMethod.BECKE00_DZP,\n        PotentialMethod.BECKE00X_XC_DZP,\n        PotentialMethod.BECKE00_X_ONLY_DZP,\n        PotentialMethod.BECKE88X_BR89C_DZP,\n        PotentialMethod.OLAP3_DZP,\n        PotentialMethod.TPSS_DZP,\n        PotentialMethod.MPBE_DZP,\n      
  PotentialMethod.OPBE_DZP,\n        PotentialMethod.OPERDEW_DZP,\n        PotentialMethod.MPBEKCIS_DZP,\n        PotentialMethod.MPW_DZP,\n        PotentialMethod.TAU_HCTH_DZP,\n        PotentialMethod.XLYP_DZP,\n        PotentialMethod.KT1_DZP,\n        PotentialMethod.KT2_DZP,\n        PotentialMethod.M06_L_DZP,\n        PotentialMethod.BLYP_D_DZP,\n        PotentialMethod.BP86_D_DZP,\n        PotentialMethod.PBE_D_DZP,\n        PotentialMethod.TPSSD_DZP,\n        PotentialMethod.B97_D_DZP,\n        PotentialMethod.REVTPSS_DZP,\n        PotentialMethod.PBESOL_DZP,\n        PotentialMethod.RGE2_DZP,\n        PotentialMethod.SSB_D_DZP,\n        PotentialMethod.MVS_DZP,\n        PotentialMethod.MVSX_DZP,\n        PotentialMethod.TMGGA_DZP,\n        PotentialMethod.TPSSH_DZP,\n        PotentialMethod.B3LYP_VWN5_DZP,\n        PotentialMethod.O3LYP_VWN5_DZP,\n        PotentialMethod.KMLYP_VWN5_DZP,\n        PotentialMethod.PBE0_DZP,\n        PotentialMethod.B3LYP_S_VWN5_DZP,\n        PotentialMethod.BHANDH_DZP,\n        PotentialMethod.BHANDHLYP_DZP,\n        PotentialMethod.B97_DZP,\n        PotentialMethod.B97_1_DZP,\n        PotentialMethod.B97_2_DZP,\n        PotentialMethod.MPBE0KCIS_DZP,\n        PotentialMethod.MPBE1KCIS_DZP,\n        PotentialMethod.B1LYP_VWN5_DZP,\n        PotentialMethod.B1PW91_VWN5_DZP,\n        PotentialMethod.MPW1PW_DZP,\n        PotentialMethod.MPW1K_DZP,\n        PotentialMethod.TAU_HCTH_HYBRID_DZP,\n        PotentialMethod.X3LYP_VWN5_DZP,\n        PotentialMethod.OPBE0_DZP,\n        PotentialMethod.M05_DZP,\n        PotentialMethod.M05_2X_DZP,\n        PotentialMethod.M06_DZP,\n        PotentialMethod.M06_2X_DZP,\n        PotentialMethod.B3LYP_D_DZP,\n        PotentialMethod.KCIS_MODIFIED_TZP,\n        PotentialMethod.KCIS_ORIGINAL_TZP,\n        PotentialMethod.PKZB_TZP,\n        PotentialMethod.VS98_TZP,\n        PotentialMethod.LDA_VWN_TZP,\n        PotentialMethod.PW91_TZP,\n        PotentialMethod.BLYP_TZP,\n        
PotentialMethod.BP_TZP,\n        PotentialMethod.PBE_TZP,\n        PotentialMethod.RPBE_TZP,\n        PotentialMethod.REVPBE_TZP,\n        PotentialMethod.OLYP_TZP,\n        PotentialMethod.FT97_TZP,\n        PotentialMethod.BLAP3_TZP,\n        PotentialMethod.HCTH_93_TZP,\n        PotentialMethod.HCTH_120_TZP,\n        PotentialMethod.HCTH_147_TZP,\n        PotentialMethod.HCTH_407_TZP,\n        PotentialMethod.BMTAU1_TZP,\n        PotentialMethod.BOP_TZP,\n        PotentialMethod.PKZBX_KCISCOR_TZP,\n        PotentialMethod.VS98_X_XC_TZP,\n        PotentialMethod.VS98_X_ONLY_TZP,\n        PotentialMethod.BECKE00_TZP,\n        PotentialMethod.BECKE00X_XC_TZP,\n        PotentialMethod.BECKE00_X_ONLY_TZP,\n        PotentialMethod.BECKE88X_BR89C_TZP,\n        PotentialMethod.OLAP3_TZP,\n        PotentialMethod.TPSS_TZP,\n        PotentialMethod.MPBE_TZP,\n        PotentialMethod.OPBE_TZP,\n        PotentialMethod.OPERDEW_TZP,\n        PotentialMethod.MPBEKCIS_TZP,\n        PotentialMethod.MPW_TZP,\n        PotentialMethod.TAU_HCTH_TZP,\n        PotentialMethod.XLYP_TZP,\n        PotentialMethod.KT1_TZP,\n        PotentialMethod.KT2_TZP,\n        PotentialMethod.M06_L_TZP,\n        PotentialMethod.BLYP_D_TZP,\n        PotentialMethod.BP86_D_TZP,\n        PotentialMethod.PBE_D_TZP,\n        PotentialMethod.TPSSD_TZP,\n        PotentialMethod.B97_D_TZP,\n        PotentialMethod.REVTPSS_TZP,\n        PotentialMethod.PBESOL_TZP,\n        PotentialMethod.RGE2_TZP,\n        PotentialMethod.SSB_D_TZP,\n        PotentialMethod.MVS_TZP,\n        PotentialMethod.MVSX_TZP,\n        PotentialMethod.TMGGA_TZP,\n        PotentialMethod.TPSSH_TZP,\n        PotentialMethod.B3LYP_VWN5_TZP,\n        PotentialMethod.O3LYP_VWN5_TZP,\n        PotentialMethod.KMLYP_VWN5_TZP,\n        PotentialMethod.PBE0_TZP,\n        PotentialMethod.B3LYP_S_VWN5_TZP,\n        PotentialMethod.BHANDH_TZP,\n        PotentialMethod.BHANDHLYP_TZP,\n        PotentialMethod.B97_TZP,\n        
PotentialMethod.B97_1_TZP,\n        PotentialMethod.B97_2_TZP,\n        PotentialMethod.MPBE0KCIS_TZP,\n        PotentialMethod.MPBE1KCIS_TZP,\n        PotentialMethod.B1LYP_VWN5_TZP,\n        PotentialMethod.B1PW91_VWN5_TZP,\n        PotentialMethod.MPW1PW_TZP,\n        PotentialMethod.MPW1K_TZP,\n        PotentialMethod.TAU_HCTH_HYBRID_TZP,\n        PotentialMethod.X3LYP_VWN5_TZP,\n        PotentialMethod.OPBE0_TZP,\n        PotentialMethod.M05_TZP,\n        PotentialMethod.M05_2X_TZP,\n        PotentialMethod.M06_TZP,\n        PotentialMethod.M06_2X_TZP,\n        PotentialMethod.B3LYP_D_TZP,\n        PotentialMethod.KCIS_MODIFIED_SZ,\n        PotentialMethod.KCIS_ORIGINAL_SZ,\n        PotentialMethod.PKZB_SZ,\n        PotentialMethod.VS98_SZ,\n        PotentialMethod.LDA_VWN_SZ,\n        PotentialMethod.PW91_SZ,\n        PotentialMethod.BLYP_SZ,\n        PotentialMethod.BP_SZ,\n        PotentialMethod.PBE_SZ,\n        PotentialMethod.RPBE_SZ,\n        PotentialMethod.REVPBE_SZ,\n        PotentialMethod.OLYP_SZ,\n        PotentialMethod.FT97_SZ,\n        PotentialMethod.BLAP3_SZ,\n        PotentialMethod.HCTH_93_SZ,\n        PotentialMethod.HCTH_120_SZ,\n        PotentialMethod.HCTH_147_SZ,\n        PotentialMethod.HCTH_407_SZ,\n        PotentialMethod.BMTAU1_SZ,\n        PotentialMethod.BOP_SZ,\n        PotentialMethod.PKZBX_KCISCOR_SZ,\n        PotentialMethod.VS98_X_XC_SZ,\n        PotentialMethod.VS98_X_ONLY_SZ,\n        PotentialMethod.BECKE00_SZ,\n        PotentialMethod.BECKE00X_XC_SZ,\n        PotentialMethod.BECKE00_X_ONLY_SZ,\n        PotentialMethod.BECKE88X_BR89C_SZ,\n        PotentialMethod.OLAP3_SZ,\n        PotentialMethod.TPSS_SZ,\n        PotentialMethod.MPBE_SZ,\n        PotentialMethod.OPBE_SZ,\n        PotentialMethod.OPERDEW_SZ,\n        PotentialMethod.MPBEKCIS_SZ,\n        PotentialMethod.MPW_SZ,\n        PotentialMethod.TAU_HCTH_SZ,\n        PotentialMethod.XLYP_SZ,\n        PotentialMethod.KT1_SZ,\n        PotentialMethod.KT2_SZ,\n        
PotentialMethod.M06_L_SZ,\n        PotentialMethod.BLYP_D_SZ,\n        PotentialMethod.BP86_D_SZ,\n        PotentialMethod.PBE_D_SZ,\n        PotentialMethod.TPSSD_SZ,\n        PotentialMethod.B97_D_SZ,\n        PotentialMethod.REVTPSS_SZ,\n        PotentialMethod.PBESOL_SZ,\n        PotentialMethod.RGE2_SZ,\n        PotentialMethod.SSB_D_SZ,\n        PotentialMethod.MVS_SZ,\n        PotentialMethod.MVSX_SZ,\n        PotentialMethod.TMGGA_SZ,\n        PotentialMethod.TPSSH_SZ,\n        PotentialMethod.B3LYP_VWN5_SZ,\n        PotentialMethod.O3LYP_VWN5_SZ,\n        PotentialMethod.KMLYP_VWN5_SZ,\n        PotentialMethod.PBE0_SZ,\n        PotentialMethod.B3LYP_S_VWN5_SZ,\n        PotentialMethod.BHANDH_SZ,\n        PotentialMethod.BHANDHLYP_SZ,\n        PotentialMethod.B97_SZ,\n        PotentialMethod.B97_1_SZ,\n        PotentialMethod.B97_2_SZ,\n        PotentialMethod.MPBE0KCIS_SZ,\n        PotentialMethod.MPBE1KCIS_SZ,\n        PotentialMethod.B1LYP_VWN5_SZ,\n        PotentialMethod.B1PW91_VWN5_SZ,\n        PotentialMethod.MPW1PW_SZ,\n        PotentialMethod.MPW1K_SZ,\n        PotentialMethod.TAU_HCTH_HYBRID_SZ,\n        PotentialMethod.X3LYP_VWN5_SZ,\n        PotentialMethod.OPBE0_SZ,\n        PotentialMethod.M05_SZ,\n        PotentialMethod.M05_2X_SZ,\n        PotentialMethod.M06_SZ,\n        PotentialMethod.M06_2X_SZ,\n        PotentialMethod.B3LYP_D_SZ,\n        PotentialMethod.GFN2_XTB,\n    ]\n\n    energy_target_names = [\n        \"KCIS-MODIFIED/DZP\",\n        \"KCIS-ORIGINAL/DZP\",\n        \"PKZB/DZP\",\n        \"VS98/DZP\",\n        \"LDA(VWN)/DZP\",\n        \"PW91/DZP\",\n        \"BLYP/DZP\",\n        \"BP/DZP\",\n        \"PBE/DZP\",\n        \"RPBE/DZP\",\n        \"REVPBE/DZP\",\n        \"OLYP/DZP\",\n        \"FT97/DZP\",\n        \"BLAP3/DZP\",\n        \"HCTH/93/DZP\",\n        \"HCTH/120/DZP\",\n        \"HCTH/147/DZP\",\n        \"HCTH/407/DZP\",\n        \"BMTAU1/DZP\",\n        \"BOP/DZP\",\n        \"PKZBX-KCISCOR/DZP\",\n        
\"VS98-X(XC)/DZP\",\n        \"VS98-X-ONLY/DZP\",\n        \"BECKE00/DZP\",\n        \"BECKE00X(XC)/DZP\",\n        \"BECKE00-X-ONLY/DZP\",\n        \"BECKE88X+BR89C/DZP\",\n        \"OLAP3/DZP\",\n        \"TPSS/DZP\",\n        \"MPBE/DZP\",\n        \"OPBE/DZP\",\n        \"OPERDEW/DZP\",\n        \"MPBEKCIS/DZP\",\n        \"MPW/DZP\",\n        \"TAU-HCTH/DZP\",\n        \"XLYP/DZP\",\n        \"KT1/DZP\",\n        \"KT2/DZP\",\n        \"M06-L/DZP\",\n        \"BLYP-D/DZP\",\n        \"BP86-D/DZP\",\n        \"PBE-D/DZP\",\n        \"TPSS-D/DZP\",\n        \"B97-D/DZP\",\n        \"REVTPSS/DZP\",\n        \"PBESOL/DZP\",\n        \"RGE2/DZP\",\n        \"SSB-D/DZP\",\n        \"MVS/DZP\",\n        \"MVSX/DZP\",\n        \"T-MGGA/DZP\",\n        \"TPSSH/DZP\",\n        \"B3LYP(VWN5)/DZP\",\n        \"O3LYP(VWN5)/DZP\",\n        \"KMLYP(VWN5)/DZP\",\n        \"PBE0/DZP\",\n        \"B3LYP*(VWN5)/DZP\",\n        \"BHANDH/DZP\",\n        \"BHANDHLYP/DZP\",\n        \"B97/DZP\",\n        \"B97-1/DZP\",\n        \"B97-2/DZP\",\n        \"MPBE0KCIS/DZP\",\n        \"MPBE1KCIS/DZP\",\n        \"B1LYP(VWN5)/DZP\",\n        \"B1PW91(VWN5)/DZP\",\n        \"MPW1PW/DZP\",\n        \"MPW1K/DZP\",\n        \"TAU-HCTH-HYBRID/DZP\",\n        \"X3LYP(VWN5)/DZP\",\n        \"OPBE0/DZP\",\n        \"M05/DZP\",\n        \"M05-2X/DZP\",\n        \"M06/DZP\",\n        \"M06-2X/DZP\",\n        \"B3LYP-D/DZP\",\n        \"KCIS-MODIFIED/TZP\",\n        \"KCIS-ORIGINAL/TZP\",\n        \"PKZB/TZP\",\n        \"VS98/TZP\",\n        \"LDA(VWN)/TZP\",\n        \"PW91/TZP\",\n        \"BLYP/TZP\",\n        \"BP/TZP\",\n        \"PBE/TZP\",\n        \"RPBE/TZP\",\n        \"REVPBE/TZP\",\n        \"OLYP/TZP\",\n        \"FT97/TZP\",\n        \"BLAP3/TZP\",\n        \"HCTH/93/TZP\",\n        \"HCTH/120/TZP\",\n        \"HCTH/147/TZP\",\n        \"HCTH/407/TZP\",\n        \"BMTAU1/TZP\",\n        \"BOP/TZP\",\n        \"PKZBX-KCISCOR/TZP\",\n        \"VS98-X(XC)/TZP\",\n        
\"VS98-X-ONLY/TZP\",\n        \"BECKE00/TZP\",\n        \"BECKE00X(XC)/TZP\",\n        \"BECKE00-X-ONLY/TZP\",\n        \"BECKE88X+BR89C/TZP\",\n        \"OLAP3/TZP\",\n        \"TPSS/TZP\",\n        \"MPBE/TZP\",\n        \"OPBE/TZP\",\n        \"OPERDEW/TZP\",\n        \"MPBEKCIS/TZP\",\n        \"MPW/TZP\",\n        \"TAU-HCTH/TZP\",\n        \"XLYP/TZP\",\n        \"KT1/TZP\",\n        \"KT2/TZP\",\n        \"M06-L/TZP\",\n        \"BLYP-D/TZP\",\n        \"BP86-D/TZP\",\n        \"PBE-D/TZP\",\n        \"TPSS-D/TZP\",\n        \"B97-D/TZP\",\n        \"REVTPSS/TZP\",\n        \"PBESOL/TZP\",\n        \"RGE2/TZP\",\n        \"SSB-D/TZP\",\n        \"MVS/TZP\",\n        \"MVSX/TZP\",\n        \"T-MGGA/TZP\",\n        \"TPSSH/TZP\",\n        \"B3LYP(VWN5)/TZP\",\n        \"O3LYP(VWN5)/TZP\",\n        \"KMLYP(VWN5)/TZP\",\n        \"PBE0/TZP\",\n        \"B3LYP*(VWN5)/TZP\",\n        \"BHANDH/TZP\",\n        \"BHANDHLYP/TZP\",\n        \"B97/TZP\",\n        \"B97-1/TZP\",\n        \"B97-2/TZP\",\n        \"MPBE0KCIS/TZP\",\n        \"MPBE1KCIS/TZP\",\n        \"B1LYP(VWN5)/TZP\",\n        \"B1PW91(VWN5)/TZP\",\n        \"MPW1PW/TZP\",\n        \"MPW1K/TZP\",\n        \"TAU-HCTH-HYBRID/TZP\",\n        \"X3LYP(VWN5)/TZP\",\n        \"OPBE0/TZP\",\n        \"M05/TZP\",\n        \"M05-2X/TZP\",\n        \"M06/TZP\",\n        \"M06-2X/TZP\",\n        \"B3LYP-D/TZP\",\n        \"KCIS-MODIFIED/SZ\",\n        \"KCIS-ORIGINAL/SZ\",\n        \"PKZB/SZ\",\n        \"VS98/SZ\",\n        \"LDA(VWN)/SZ\",\n        \"PW91/SZ\",\n        \"BLYP/SZ\",\n        \"BP/SZ\",\n        \"PBE/SZ\",\n        \"RPBE/SZ\",\n        \"REVPBE/SZ\",\n        \"OLYP/SZ\",\n        \"FT97/SZ\",\n        \"BLAP3/SZ\",\n        \"HCTH/93/SZ\",\n        \"HCTH/120/SZ\",\n        \"HCTH/147/SZ\",\n        \"HCTH/407/SZ\",\n        \"BMTAU1/SZ\",\n        \"BOP/SZ\",\n        \"PKZBX-KCISCOR/SZ\",\n        \"VS98-X(XC)/SZ\",\n        \"VS98-X-ONLY/SZ\",\n        \"BECKE00/SZ\",\n        
\"BECKE00X(XC)/SZ\",\n        \"BECKE00-X-ONLY/SZ\",\n        \"BECKE88X+BR89C/SZ\",\n        \"OLAP3/SZ\",\n        \"TPSS/SZ\",\n        \"MPBE/SZ\",\n        \"OPBE/SZ\",\n        \"OPERDEW/SZ\",\n        \"MPBEKCIS/SZ\",\n        \"MPW/SZ\",\n        \"TAU-HCTH/SZ\",\n        \"XLYP/SZ\",\n        \"KT1/SZ\",\n        \"KT2/SZ\",\n        \"M06-L/SZ\",\n        \"BLYP-D/SZ\",\n        \"BP86-D/SZ\",\n        \"PBE-D/SZ\",\n        \"TPSS-D/SZ\",\n        \"B97-D/SZ\",\n        \"REVTPSS/SZ\",\n        \"PBESOL/SZ\",\n        \"RGE2/SZ\",\n        \"SSB-D/SZ\",\n        \"MVS/SZ\",\n        \"MVSX/SZ\",\n        \"T-MGGA/SZ\",\n        \"TPSSH/SZ\",\n        \"B3LYP(VWN5)/SZ\",\n        \"O3LYP(VWN5)/SZ\",\n        \"KMLYP(VWN5)/SZ\",\n        \"PBE0/SZ\",\n        \"B3LYP*(VWN5)/SZ\",\n        \"BHANDH/SZ\",\n        \"BHANDHLYP/SZ\",\n        \"B97/SZ\",\n        \"B97-1/SZ\",\n        \"B97-2/SZ\",\n        \"MPBE0KCIS/SZ\",\n        \"MPBE1KCIS/SZ\",\n        \"B1LYP(VWN5)/SZ\",\n        \"B1PW91(VWN5)/SZ\",\n        \"MPW1PW/SZ\",\n        \"MPW1K/SZ\",\n        \"TAU-HCTH-HYBRID/SZ\",\n        \"X3LYP(VWN5)/SZ\",\n        \"OPBE0/SZ\",\n        \"M05/SZ\",\n        \"M05-2X/SZ\",\n        \"M06/SZ\",\n        \"M06-2X/SZ\",\n        \"B3LYP-D/SZ\",\n        \"GFNXTB\",\n    ]\n\n    __energy_unit__ = \"ev\"  # to fix\n    __distance_unit__ = \"ang\"  # to fix\n    __forces_unit__ = \"ev/ang\"  # to fix\n    __links__ = {\n        \"xyz.zip\": \"https://data.dtu.dk/ndownloader/files/35143624\",\n        \"xtb.zip\": \"https://data.dtu.dk/ndownloader/files/42444300\",\n        \"dzp.zip\": \"https://data.dtu.dk/ndownloader/files/42443925\",\n        \"tzp.zip\": \"https://data.dtu.dk/ndownloader/files/42444129\",\n        \"sz.zip\": \"https://data.dtu.dk/ndownloader/files/42441345\",\n        \"failed_indices.dat\": \"https://data.dtu.dk/ndownloader/files/37337677\",\n    }\n\n    def _read_molecules_energies(self):\n        d = {\"DZP\": None, \"TZP\": 
None, \"SZ\": None, \"XTB\": None}\n        for basis in d.keys():\n            d[basis] = pd.read_csv(p_join(self.root, basis, \"molecules/molecules.csv\"), index_col=False).drop(\n                columns=[\"index\"]\n            )\n        return pd.concat([d[\"DZP\"], d[\"TZP\"], d[\"SZ\"], d[\"XTB\"]], axis=1, ignore_index=False)\n\n    def _read_all_xyzs(self):\n        xyz_list = read_xyz_files(self.root)\n        return pd.DataFrame(xyz_list)\n\n    def read_raw_entries(self):\n        df_energies = self._read_molecules_energies()\n        df_xyz = self._read_all_xyzs()\n        return [\n            {\"energies\": np.atleast_2d(en), **xyz_dict}\n            for xyz_dict, en in zip(df_xyz.to_dict(\"records\"), df_energies.values.astype(np.float64))\n        ]\n
    "},{"location":"API/datasets/nabladft.html","title":"NablaDFT","text":""},{"location":"API/datasets/nabladft.html#openqdc.datasets.potential.nabladft.NablaDFT","title":"NablaDFT","text":"

    Bases: BaseDataset

    NablaDFT is a dataset constructed from a subset of the Molecular Sets (MOSES) dataset consisting of 1 million molecules with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of conformations is generated using RDKit. Second, using the Butina Clustering Method on conformations, clusters that cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set. This results in 1-62 conformations per molecule. For generating quantum properties, the Kohn-Sham method at the wB97X-D/def2-SVP level is used to generate the energy.

    Usage:

    from openqdc.datasets import NablaDFT\ndataset = NablaDFT()\n

    References

    https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D

    https://github.com/AIRI-Institute/nablaDFT

    Source code in openqdc/datasets/potential/nabladft.py
    class NablaDFT(BaseDataset):\n    \"\"\"\n    NablaDFT is a dataset constructed from a subset of the\n    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules\n    with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of\n    conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that\n    cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set.\n    This results in 1-62 conformations per molecule. For generating quantum properties, Kohn-Sham method at\n    wB97X-D/def2-XVP levels are used to generate the energy.\n\n    Usage:\n    ```python\n    from openqdc.datasets import NablaDFT\n    dataset = NablaDFT()\n    ```\n\n    References:\n        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\\n\n        https://github.com/AIRI-Institute/nablaDFT\n    \"\"\"\n\n    __name__ = \"nabladft\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D_DEF2_SVP,\n    ]  # \"wb97x-d/def2-svp\"\n\n    energy_target_names = [\"wb97x-d/def2-svp\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"nabladft.db\": \"https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db\"}\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    @requires_package(\"nablaDFT\")\n    def read_raw_entries(self):\n        from nablaDFT.dataset import HamiltonianDatabase\n\n        label_path = p_join(self.root, \"summary.csv\")\n        df = pd.read_csv(label_path, usecols=[\"MOSES id\", \"CONFORMER id\", \"SMILES\", \"DFT TOTAL ENERGY\"])\n        labels = 
df.set_index(keys=[\"MOSES id\", \"CONFORMER id\"]).to_dict(\"index\")\n\n        raw_path = p_join(self.root, \"dataset_full.db\")\n        train = HamiltonianDatabase(raw_path)\n        n, c = len(train), 20\n        step_size = int(np.ceil(n / os.cpu_count()))\n\n        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)\n        samples = dm.parallelized(\n            fn, list(range(c)), n_jobs=c, progress=False, scheduler=\"threads\"\n        )  # don't use more than 1 job\n\n        return sum(samples, [])\n
    "},{"location":"API/datasets/orbnet_denali.html","title":"Orbnet Denali","text":""},{"location":"API/datasets/orbnet_denali.html#openqdc.datasets.potential.orbnet_denali.OrbnetDenali","title":"OrbnetDenali","text":"

    Bases: BaseDataset

    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of theory.

    Usage:

    from openqdc.datasets import OrbnetDenali\ndataset = OrbnetDenali()\n

    References

    https://arxiv.org/abs/2107.00299

    https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867

    Source code in openqdc/datasets/potential/orbnet_denali.py
    class OrbnetDenali(BaseDataset):\n    \"\"\"\n    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range\n    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and\n    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps.\n    First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer\n    generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using\n    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of\n    theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of\n    theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import OrbnetDenali\n    dataset = OrbnetDenali()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00299\\n\n        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867\n    \"\"\"\n\n    __name__ = \"orbnet_denali\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_DEF2_TZVP,\n        PotentialMethod.GFN1_XTB,\n    ]  # [\"wb97x-d3/def2-tzvp\", \"gfn1_xtb\"]\n    energy_target_names = [\"dft_energy\", \"xtb1_energy\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"orbnet_denali.tar.gz\": \"https://figshare.com/ndownloader/files/28672287\",\n        \"orbnet_denali_targets.tar.gz\": \"https://figshare.com/ndownloader/files/28672248\",\n    }\n\n    def read_raw_entries(self):\n        label_path = p_join(self.root, \"denali_labels.csv\")\n        df = pd.read_csv(label_path, usecols=[\"sample_id\", \"mol_id\", \"subset\", \"dft_energy\", \"xtb1_energy\"])\n        labels = {\n            mol_id: 
group.drop([\"mol_id\"], axis=1).drop_duplicates(\"sample_id\").set_index(\"sample_id\").to_dict(\"index\")\n            for mol_id, group in df.groupby(\"mol_id\")\n        }\n\n        fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)\n        res = dm.parallelized(fn, list(labels.items()), scheduler=\"threads\", n_jobs=-1, progress=True)\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/pcqm.html","title":"PCQM","text":""},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_B3LYP","title":"PCQM_B3LYP","text":"

    Bases: PCQM_PM6

    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to biomolecules. The geometries for the molecules are optimized using PM6. Using the optimized geometry, the electronic structure and properties are calculated using the B3LYP/6-31G* method.

    Usage:

    from openqdc.datasets import PCQM_B3LYP\ndataset = PCQM_B3LYP()\n

    References

    https://arxiv.org/abs/2305.18454

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_B3LYP(PCQM_PM6):\n    \"\"\"\n    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to\n    biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry,\n    the electronic structure and properties are calculated using B3LIP/6-31G* method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_B3LYP\n    dataset = PCQM_B3LYP()\n    ```\n\n    References:\n        https://arxiv.org/abs/2305.18454\n    \"\"\"\n\n    __name__ = \"pubchemqc_b3lyp\"\n    __energy_methods__ = [\"b3lyp/6-31g*\"]\n    energy_target_names = [\"b3lyp\"]\n
    "},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_PM6","title":"PCQM_PM6","text":"

    Bases: BaseDataset

    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with weights less than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also computed using the PM6 method.

    Usage:

    from openqdc.datasets import PCQM_PM6\ndataset = PCQM_PM6()\n

    References

    https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_PM6(BaseDataset):\n    \"\"\"\n    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized\n    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less\n    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel\n    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also\n    computed using the PM6 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_PM6\n    dataset = PCQM_PM6()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740\n    \"\"\"\n\n    __name__ = \"pubchemqc_pm6\"\n    __energy_methods__ = [PotentialMethod.PM6]\n\n    energy_target_names = [\"pm6\"]\n\n    __force_methods__ = []\n    force_target_names = []\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"pubchemqc\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def collate_list(self, list_entries):\n        predicat = list_entries is not None and len(list_entries) > 0\n        list_entries = [x for x in list_entries if x is not None]\n        if predicat:\n            res = super().collate_list(list_entries)\n        else:\n            res = None\n        return res\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    def read_raw_entries(self):\n        arxiv_paths = glob(p_join(self.root, f\"{self.__energy_methods__[0]}\", \"*.pkl\"))\n        f = lambda x: self.collate_list(read_preprocessed_archive(x))\n        samples = dm.parallelized(f, arxiv_paths, 
n_jobs=1, progress=True)\n        samples = [x for x in samples if x is not None]\n        return samples\n\n    def preprocess(self, overwrite=False):\n        if overwrite or not self.is_preprocessed():\n            logger.info(\"Preprocessing data and saving it to cache.\")\n            logger.info(\n                f\"Dataset {self.__name__} data with the following units:\\n\"\n                f\"Energy: {self.energy_unit}, Distance: {self.distance_unit}, \"\n                f\"Forces: {self.force_unit if self.__force_methods__ else 'None'}\"\n            )\n            entries = self.read_raw_entries()\n            self.collate_and_save_list(entries)\n\n    def collate_and_save_list(self, list_entries):\n        n_molecules, n_atoms = 0, 0\n        for i in range(len(list_entries)):\n            list_entries[i][\"position_idx_range\"] += n_atoms\n            n_atoms += list_entries[i][\"position_idx_range\"].max()\n            n_molecules += list_entries[i][\"position_idx_range\"].shape[0]\n\n        for key in self.data_keys:\n            first = list_entries[0][key]\n            shape = (n_molecules, *first.shape[1:])\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\")\n            out = np.memmap(local_path, mode=\"w+\", dtype=first.dtype, shape=shape)\n\n            start = 0\n            for i in range(len(list_entries)):\n                x = list_entries[i].pop(key)\n                n = x.shape[0]\n                out[start : start + n] = x\n                out.flush()\n            push_remote(local_path, overwrite=True)\n\n        # save smiles and subset\n        tmp, n = dict(name=[]), len(list_entries)\n        local_path = p_join(self.preprocess_path, \"props.pkl\")\n        names = [list_entries[i].pop(\"name\") for i in range(n)]\n        f = lambda xs: [dm.to_inchikey(x) for x in xs]\n        res = dm.parallelized(f, names, n_jobs=-1, progress=False)\n        for x in res:\n            tmp[\"name\"] += x\n        for key in 
[\"subset\", \"n_atoms\"]:\n            tmp[key] = []\n            for i in range(n):\n                tmp[key] += list(list_entries[i].pop(key))\n        with open(local_path, \"wb\") as f:\n            pkl.dump(tmp, f)\n        push_remote(local_path, overwrite=True)\n
    "},{"location":"API/datasets/proteinfragments.html","title":"Protein Fragments","text":""},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.MDDataset","title":"MDDataset","text":"

    Bases: ProteinFragments

    MDDataset is a subset of the proteinfragments dataset that was generated from molecular dynamics with their model. The sampling was done with Molecular Dynamics at room temperature (300K) in various solvent phases:

    Subsets

    Polyalanine: All the polyalanine peptides are sampled in the gas phase. AceAla15Lys is a polyalanine peptide capped with an N-terminal acetyl group and a protonated lysine residue at the C-terminus; AceAla15Nme is a polyalanine peptide capped with an N-terminal acetyl group and a C-terminal N-methyl amide group.

    Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)

    Usage:

    from openqdc.datasets import MDDataset\ndataset = MDDataset()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class MDDataset(ProteinFragments):\n    \"\"\"\n    MDDataset is a subset of the proteinfragments dataset that\n    generated from the molecular dynamics with their model.\n    The sampling was done with Molecular Dynamics\n    at room temperature 300K in various solvent phase:\n\n    Subsets:\n        Polyalanine:\n            All the polyalanine are sampled in gas phase. AceAla15Lys is\n            a polyalanine peptides capped with an N-terminal acetyl group\n            and a protonated lysine residue at the C-terminus,\n            Acela15nme is polyalanine peptide capped with an N-terminal acetyl group\n            and a C-terminal N-methyl amide group\\n\n        Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)\n\n    Usage:\n    ```python\n    from openqdc.datasets import MDDataset\n    dataset = MDDataset()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"mddataset\"\n\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"acala15nme_folding_clusters\", \"crambin\", \"minimahopping_acala15lysh\", \"minimahopping_acala15nme\"]\n    }\n
    "},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.ProteinFragments","title":"ProteinFragments","text":"

    Bases: BaseDataset

    ProteinFragments is a dataset whose data was generated from a top-down and a bottom-up approach:

    Top-down

    Fragments are generated by cutting out a spherical region around an atom (including solvent molecules) and saturating all dangling bonds. Sampling was done with the Molecular Dynamics (MD) method from conventional FF at room temperature.

    Bottom-up

    Fragments are generated by constructing chemical graphs of one to eight nonhydrogen atoms. Sampling of multiple conformers per fragment was done with MD simulations at high temperatures or normal mode sampling.

    Usage:

    from openqdc.datasets import ProteinFragments\ndataset = ProteinFragments()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class ProteinFragments(BaseDataset):\n    \"\"\"\n    ProteinFragments is a dataset constructed from a subset of the\n    the data was generated from a top-down and bottom-up approach:\n\n    Top-down:\n        Fragments are generated by cutting out a spherical\n        region around an atom (including solvent molecules)\n        and saturating all dangling bonds.\n        Sampling was done with the Molecular Dynamics (MD) method from\n        conventional FF at room temperature.\n\n    Bottom-up:\n        Fragments are generated by constructing chemical graphs\n        of one to eight nonhydrogen atoms.\n        Sampling of multiple conformers per fragments was done with\n        MD simulations at high temperatures or normal mode sampling.\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import ProteinFragments\n    dataset = ProteinFragments()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"proteinfragments\"\n    # PBE0/def2-TZVPP+MBD\n    __energy_methods__ = [\n        PotentialMethod.PBE0_MBD_DEF2_TZVPP,\n    ]\n\n    energy_target_names = [\n        \"PBE0+MBD/def2-TZVPP\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"general_protein_fragments\"]\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"proteinfragments\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"proteinfragments\", links=self.__links__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        samples = []\n      
  for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.extend(read_db(raw_path))\n        return samples\n
    "},{"location":"API/datasets/qm1b.html","title":"QM1B","text":""},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B","title":"QM1B","text":"

    Bases: BaseDataset

    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. Electronic properties for each conformation are then calculated using the density functional B3LYP and the basis set STO-3G.

    Usage:

    from openqdc.datasets import QM1B\ndataset = QM1B()\n

    References

    https://arxiv.org/pdf/2311.01135

    https://github.com/graphcore-research/qm1b-dataset/

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B(BaseDataset):\n    \"\"\"\n    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom\n    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are\n    subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit.\n    Electronic properties for each conformation are then calculated using the density functional B3LYP\n    and the basis set STO-3G.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B\n    dataset = QM1B()\n    ```\n\n    References:\n        https://arxiv.org/pdf/2311.01135\\n\n        https://github.com/graphcore-research/qm1b-dataset/\n    \"\"\"\n\n    __name__ = \"qm1b\"\n\n    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]\n    __force_methods__ = []\n\n    energy_target_names = [\"b3lyp/sto-3g\"]\n    force_target_names = []\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"ev/bohr\"\n    __links__ = {\n        \"qm1b_validation.parquet\": \"https://ndownloader.figshare.com/files/43005175\",\n        **{f\"part_{i:03d}.parquet\": f\"https://ndownloader.figshare.com/files/{FILE_NUM[i]}\" for i in range(0, 256)},\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qm1b\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        filenames = list(map(lambda x: p_join(self.root, f\"part_{x:03d}.parquet\"), list(range(0, 256)))) + [\n            p_join(self.root, \"qm1b_validation.parquet\")\n        ]\n\n        def read_entries_parallel(filename):\n            df = pd.read_parquet(filename)\n\n            def extract_parallel(df, i):\n                return extract_from_row(df.iloc[i])\n\n            fn = 
partial(extract_parallel, df)\n            list_of_idxs = list(range(len(df)))\n            results = dm.utils.parallelized(fn, list_of_idxs, scheduler=\"threads\", progress=False)\n            return results\n\n        list_of_list = dm.utils.parallelized(read_entries_parallel, filenames, scheduler=\"processes\", progress=True)\n\n        return [x for xs in list_of_list for x in xs]\n
    "},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B_SMALL","title":"QM1B_SMALL","text":"

    Bases: QM1B

    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.

    Usage:

    from openqdc.datasets import QM1B_SMALL\ndataset = QM1B_SMALL()\n

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B_SMALL(QM1B):\n    \"\"\"\n    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B_SMALL\n    dataset = QM1B_SMALL()\n    ```\n    \"\"\"\n\n    __name__ = \"qm1b_small\"\n
    "},{"location":"API/datasets/qm7x.html","title":"QM7X","text":""},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X","title":"QM7X","text":"

    Bases: BaseDataset

    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has energy values for each geometry computed with the PBE0-MBD and DFTB3-MBD methods.

    Usage:

    from openqdc.datasets import QM7X\ndataset = QM7X()\n

    References

    https://arxiv.org/abs/2006.15139

    https://zenodo.org/records/4288677

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X(BaseDataset):\n    \"\"\"\n    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with\n    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations,\n    OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-\n    stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure\n    is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)\n    interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non\n    -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of\n    normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has\n    energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X\n    dataset = QM7X()\n    ```\n\n    References:\n        https://arxiv.org/abs/2006.15139\\n\n        https://zenodo.org/records/4288677\n    \"\"\"\n\n    __name__ = \"qm7x\"\n\n    __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # \"pbe0/def2-tzvp\", \"dft3b\"]\n\n    energy_target_names = [\"ePBE0+MBD\", \"eDFTB+MBD\"]\n\n    __force_mask__ = [True, False]\n\n    force_target_names = [\"pbe0FOR\"]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {f\"{i}000.xz\": f\"https://zenodo.org/record/4288677/files/{i}000.xz\" for i in range(1, 9)}\n\n    def read_raw_entries(self):\n        samples = []\n        for i in range(1, 9):\n            raw_path = p_join(self.root, f\"{i}000\")\n            data = load_hdf5_file(raw_path)\n            samples += [\n                
read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys())\n            ]\n\n        return samples\n
    "},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X_V2","title":"QM7X_V2","text":"

    Bases: QM7X

    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.

    Usage:

    from openqdc.datasets import QM7X_V2\ndataset = QM7X_V2()\n

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X_V2(QM7X):\n    \"\"\"\n    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X_V2\n    dataset = QM7X_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qm7x_v2\"\n    __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]\n    __force_mask__ = QM7X.__force_mask__ + [False]\n    energy_target_names = QM7X.energy_target_names + [\"PM6\"]\n    force_target_names = QM7X.force_target_names\n
    "},{"location":"API/datasets/qmugs.html","title":"Qmugs","text":""},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs","title":"QMugs","text":"

    Bases: BaseDataset

    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB method. Using the optimized geometry, the atomic and molecular properties are calculated using both a semi-empirical method (GFN2-xTB) and a DFT method (\u03c9B97X-D/def2-SVP).

    Usage:

    from openqdc.datasets import QMugs\ndataset = QMugs()\n

    References

    https://arxiv.org/abs/2107.00367

    https://www.nature.com/articles/s41597-022-01390-7#ethics

    https://www.research-collection.ethz.ch/handle/20.500.11850/482129

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs(BaseDataset):\n    \"\"\"\n    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules\n    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB\n    method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical\n    method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs\n    dataset = QMugs()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00367\\n\n        https://www.nature.com/articles/s41597-022-01390-7#ethics\\n\n        https://www.research-collection.ethz.ch/handle/20.500.11850/482129\n    \"\"\"\n\n    __name__ = \"qmugs\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP]  # \"gfn2_xtb\", \"wb97x-d/def2-svp\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"summary.csv\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv\",\n        \"structures.tar.gz\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz\",  # noqa\n    }\n\n    energy_target_names = [\n        \"GFN2:TOTAL_ENERGY\",\n        \"DFT:TOTAL_ENERGY\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"structures\")\n        mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]\n\n        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler=\"threads\")\n        return samples\n
    "},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs_V2","title":"QMugs_V2","text":"

    Bases: QMugs

    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.

    Usage:

    from openqdc.datasets import QMugs_V2\ndataset = QMugs_V2()\n

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs_V2(QMugs):\n    \"\"\"\n    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs_V2\n    dataset = QMugs_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qmugs_v2\"\n    __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]\n    energy_target_names = QMugs.energy_target_names + [\"PM6\"]\n    __force_mask__ = QMugs.__force_mask__ + [False]\n
    "},{"location":"API/datasets/qmx.html","title":"QMX","text":""},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7","title":"QM7","text":"

    Bases: QMX

    QM7 is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, H]

    Usage:

    from openqdc.datasets import QM7\ndataset = QM7()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7(QMX):\n    \"\"\"\n    QM7 is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7\n    dataset = QM7()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7.hdf5.gz\": \"https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1\"}\n    __name__ = \"qm7\"\n\n    energy_target_names = [\n        \"B2PLYP-D3(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3(BJ):def2-svp\",\n        \"B2PLYP-D3(BJ):def2-tzvp\",\n        \"B2PLYP-D3(BJ):sto-3g\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        
\"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"MP2:aug-cc-pvdz\",\n        \"MP2:aug-cc-pvtz\",\n        \"MP2:def2-svp\",\n        \"MP2:def2-tzvp\",\n        \"MP2:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7b","title":"QM7b","text":"

    Bases: QMX

    QM7b is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, Cl, H]

    Usage:

    from openqdc.datasets import QM7b\ndataset = QM7b()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7b(QMX):\n    \"\"\"\n    QM7b is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, Cl, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7b\n    dataset = QM7b()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7b.hdf5.gz\": \"https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1\"}\n    __name__ = \"qm7b\"\n    energy_target_names = [\n        \"CCSD(T0):cc-pVDZ\",\n        \"HF:cc-pVDZ\",\n        \"HF:cc-pVTZ\",\n        \"MP2:cc-pVTZ\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        
\"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM8","title":"QM8","text":"

    Bases: QMX

    QM8 is the subset of QM9 used in a study on modeling quantum mechanical calculations of electronic spectra and excited state energy (an increase in energy relative to the ground state) of small molecules with up to eight heavy atoms. Multiple methods were used, including time-dependent density functional theories (TDDFT) and second-order approximate coupled-cluster (CC2). The molecular conformations are relaxed geometries computed using the DFT B3LYP with basis set 6-31G(2df,p). For more information about the sampling, check the QM9 dataset.

    Usage:

    from openqdc.datasets import QM8\ndataset = QM8()\n

    References

    https://arxiv.org/pdf/1504.01966

    Source code in openqdc/datasets/potential/qmx.py
    class QM8(QMX):\n    \"\"\"QM8 is the subset of QM9 used in a study on modeling quantum\n    mechanical calculations of electronic spectra and excited\n    state energy (a increase of energy from the ground states) of small molecules\n    up to eight heavy atoms.\n    Multiple methods were used, including\n    time-dependent density functional theories (TDDFT) and\n    second-order approximate coupled-cluster (CC2).\n    The molecules conformations are relaxed geometries computed using\n    the DFT B3LYP with basis set 6-31G(2df,p).\n    For more information about the sampling, check QM9 dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM8\n    dataset = QM8()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1504.01966\n    \"\"\"\n\n    __name__ = \"qm8\"\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n\n    __links__ = {\n        \"qm8.csv\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv\",\n        \"qm8.tar.gz\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz\",\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"qm8.csv\"))\n        mols = dm.read_sdf(p_join(self.root, \"qm8.sdf\"), sanitize=False, remove_hs=False)\n        samples = []\n        for idx_row, mol in zip(df.iterrows(), mols):\n            _, row = idx_row\n            positions = mol.GetConformer().GetPositions()\n            x = get_atomic_number_and_charge(mol)\n            n_atoms = positions.shape[0]\n            samples.append(\n                dict(\n                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),\n                    name=np.array([row[\"smiles\"]]),\n        
            energies=np.array(\n                        [\n                            row[\n                                [\"E1-CC2\", \"E2-CC2\", \"E1-PBE0\", \"E2-PBE0\", \"E1-PBE0.1\", \"E2-PBE0.1\", \"E1-CAM\", \"E2-CAM\"]\n                            ].tolist()\n                        ],\n                        dtype=np.float64,\n                    ).reshape(1, -1),\n                    n_atoms=np.array([n_atoms], dtype=np.int32),\n                    subset=np.array([f\"{self.__name__}\"]),\n                )\n            )\n        return samples\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM9","title":"QM9","text":"

    Bases: QMX

    QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database, with up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed by relaxing geometries with the quantum mechanical method B3LYP.

    Usage:

    from openqdc.datasets import QM9\ndataset = QM9()\n

    Reference

    https://www.nature.com/articles/sdata201422

    Source code in openqdc/datasets/potential/qmx.py
    class QM9(QMX):\n    \"\"\"\n    QM7b is a dataset constructed containing 134k molecules from subsets of the GDB-17 database,\n    containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at B3LUP/6-31G(2df,p)\n    level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed\n    by relaxing geometries with quantum mechanical method B3LYP.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM9\n    dataset = QM9()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/sdata201422\n    \"\"\"\n\n    __links__ = {\"qm9.hdf5.gz\": \"https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1\"}\n    __name__ = \"qm9\"\n    energy_target_names = [\n        \"Internal energy at 0 K\",\n        \"B3LYP:def2-svp\",\n        \"HF:cc-pvtz\",\n        \"HF:sto-3g\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n    ]\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QMX","title":"QMX","text":"

    Bases: ABC, BaseDataset

    QMX dataset base abstract class

    Source code in openqdc/datasets/potential/qmx.py
    class QMX(ABC, BaseDataset):\n    \"\"\"\n    QMX dataset base abstract class\n    \"\"\"\n\n    __name__ = \"qm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qmx\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"qmx\", links=self.__links__)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)\n        return samples\n
    "},{"location":"API/datasets/revmd17.html","title":"RevMD17","text":""},{"location":"API/datasets/revmd17.html#openqdc.datasets.potential.revmd17.RevMD17","title":"RevMD17","text":"

    Bases: BaseDataset

    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT integration grid. The dataset contains the following molecules: Benzene: 627000 samples

    Uracil: 133000 samples\n\nNaphthalene: 326000 samples\n\nAspirin: 211000 samples\n\nSalicylic Acid: 320000 samples\n\nMalonaldehyde: 993000 samples\n\nEthanol: 555000 samples\n\nToluene: 100000 samples\n

    Usage:

    from openqdc.datasets import RevMD17\ndataset = RevMD17()\n

    References

    https://arxiv.org/abs/2007.09593

    Source code in openqdc/datasets/potential/revmd17.py
    class RevMD17(BaseDataset):\n    \"\"\"\n    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original\n    dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies\n    are computed at the PBE/def2-SVP level of theory using very tigh SCF convergence and very dense DFT integration\n    grid. The dataset contains the following molecules:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import RevMD17\n    dataset = RevMD17()\n    ```\n\n    References:\n        https://arxiv.org/abs/2007.09593\n    \"\"\"\n\n    __name__ = \"revmd17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP\n        # \"pbe/def2-tzvp\",\n    ]\n    __force_mask__ = [True]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_methods__ = [\n        \"pbe/def2-tzvp\",\n    ]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n    __links__ = {\"revmd17.zip\": \"https://figshare.com/ndownloader/articles/12672038/versions/3\"}\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    def read_raw_entries(self):\n        entries_list = []\n        decompress_tar_gz(p_join(self.root, \"rmd17.tar.bz2\"))\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/sn2_rxn.html","title":"SN2 RXN","text":""},{"location":"API/datasets/sn2_rxn.html#openqdc.datasets.potential.sn2_rxn.SN2RXN","title":"SN2RXN","text":"

    Bases: BaseDataset

    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset contains 452,709 structures along with the energy, force and dipole moments.

    Usage:

    from openqdc.datasets import SN2RXN\ndataset = SN2RXN()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605341

    Source code in openqdc/datasets/potential/sn2_rxn.py
    class SN2RXN(BaseDataset):\n    \"\"\"\n    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X +  Y-, and\n    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by\n    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment\n    (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and\n    for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset\n    contains 452,709 structures along with the energy, force and dipole moments.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SN2RXN\n    dataset = SN2RXN()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605341\n    \"\"\"\n\n    __name__ = \"sn2_rxn\"\n\n    __energy_methods__ = [\n        PotentialMethod.DSD_BLYP_D3_BJ_DEF2_TZVP\n        # \"dsd-blyp-d3(bj)/def2-tzvp\",\n    ]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"sn2_rxn.npz\": \"https://zenodo.org/records/2605341/files/sn2_reactions.npz\"}\n\n    energy_target_names = [\n        # TODO: We need to revalidate this to make sure that is not atomization energies.\n        \"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"DSD-BLYP-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"sn2_rxn.npz\")\n        data = np.load(raw_path)\n        samples = extract_npz_entry(data)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html","title":"Solvated Peptides","text":""},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides","title":"SolvatedPeptides","text":"

    Bases: BaseDataset

    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\" and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10 steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.

    Usage:

    from openqdc.datasets import SolvatedPeptides\ndataset = SolvatedPeptides()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605372

    Source code in openqdc/datasets/potential/solvated_peptides.py
    class SolvatedPeptides(BaseDataset):\n    \"\"\"\n    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\"\n    and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are\n    run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10\n    steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SolvatedPeptides\n    dataset = SolvatedPeptides()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605372\n    \"\"\"\n\n    __name__ = \"solvated_peptides\"\n\n    __energy_methods__ = [\n        PotentialMethod.REVPBE_D3_BJ_DEF2_TZVP\n        # \"revpbe-d3(bj)/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    # TO CHECK\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"solvated_peptides.hdf5.gz\": \"https://zenodo.org/record/3585804/files/213.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"solvated_peptides.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"solvated_peptides\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/solvated_peptides.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/spice.html","title":"Spice","text":""},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.Spice","title":"Spice","text":"

    Bases: BaseDataset

    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate 100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the wB97M-D3(BJ)/def2-TZVPPD level of theory.

    Usage:

    from openqdc.datasets import Spice\ndataset = Spice()\n

    References

    https://arxiv.org/abs/2209.10702

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class Spice(BaseDataset):\n    \"\"\"\n    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of\n    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,\n    and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate\n    100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and\n    molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the\n    wB97M-D3(BJ)/def2-TZVPPD level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Spice\n    dataset = Spice()\n    ```\n\n    References:\n        https://arxiv.org/abs/2209.10702\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice\"\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n\n    energy_target_names = [\"dft_total_energy\"]\n\n    force_target_names = [\"dft_total_gradient\"]\n\n    subset_mapping = {\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Dipeptides Single Points Dataset v1.2\": \"Dipeptides\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.0\": \"DES370K Dimers\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset 
v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE Ion Pairs Single Points Dataset v1.1\": \"Ion Pairs\",\n    }\n    __links__ = {\"SPICE-1.1.4.hdf5\": \"https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5\"}\n\n    def convert_forces(self, x):\n        return (-1.0) * super().convert_forces(x)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"SPICE-1.1.4.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        tmp = [read_record(data[mol_name], self) for mol_name in tqdm(data)]  # don't use parallelized here\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceV2","title":"SpiceV2","text":"

    Bases: Spice

    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules. The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.

    Usage:

    from openqdc.datasets import SpiceV2\ndataset = SpiceV2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceV2(Spice):\n    \"\"\"\n    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules.\n    The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain\n    silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve\n    sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and\n    (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceV2\n    dataset = SpiceV2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spicev2\"\n\n    subset_mapping = {\n        \"SPICE Dipeptides Single Points Dataset v1.3\": \"Dipeptides\",\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Water Clusters v1.0\": \"Water Clusters\",\n        \"SPICE Solvated PubChem Set 1 v1.0\": \"Solvated PubChem\",\n        \"SPICE Amino Acid Ligand v1.0\": \"Amino Acid Ligand\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 7 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 8 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 9 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem 
Set 10 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.1\": \"DES370K Dimers\",\n        \"SPICE PubChem Boron Silicon v1.0\": \"PubChem Boron Silicon\",\n        \"SPICE Ion Pairs Single Points Dataset v1.2\": \"Ion Pairs\",\n    }\n    __links__ = {\"spice-2.0.0.hdf5\": \"https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"spice-2.0.0.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        # Entry 40132 without positions, skip it\n        # don't use parallelized here\n        tmp = [read_record(data[mol_name], self) for i, mol_name in enumerate(tqdm(data)) if i != 40132]\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceVL2","title":"SpiceVL2","text":"

    Bases: SpiceV2

    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.

    Usage:

    from openqdc.datasets import SpiceVL2\ndataset = SpiceVL2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceVL2(SpiceV2):\n    \"\"\"\n    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceVL2\n    dataset = SpiceVL2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice_vl2\"\n\n    __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]\n    energy_target_names = SpiceV2.energy_target_names + [\"GFN2,\" \"PM6\"]\n    __force_mask__ = SpiceV2.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.read_record","title":"read_record(r, obj)","text":"

    Read record from hdf5 file. r : hdf5 record obj : Spice class object used to grab subset and names

    Source code in openqdc/datasets/potential/spice.py
    def read_record(r, obj):\n    \"\"\"\n    Read record from hdf5 file.\n        r : hdf5 record\n        obj : Spice class object used to grab subset and names\n    \"\"\"\n    smiles = r[\"smiles\"].asstr()[0]\n    subset = r[\"subset\"][0].decode(\"utf-8\")\n    n_confs = r[\"conformations\"].shape[0]\n    x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))\n    positions = r[\"conformations\"][:]\n\n    res = dict(\n        name=np.array([smiles] * n_confs),\n        subset=np.array([obj.subset_mapping[subset]] * n_confs),\n        energies=r[obj.energy_target_names[0]][:][:, None].astype(np.float64),\n        forces=r[obj.force_target_names[0]][:].reshape(\n            -1, 3, 1\n        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method\n        atomic_inputs=np.concatenate(\n            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32\n        ).reshape(-1, 5),\n        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/splinter.html","title":"Splinter","text":""},{"location":"API/datasets/splinter.html#openqdc.datasets.interaction.splinter.Splinter","title":"Splinter","text":"

    Bases: BaseInteractionDataset

    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated by quantum mechanical optimization at the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies and the various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.

    Usage:

    from openqdc.datasets import Splinter\ndataset = Splinter()\n

    Reference

    https://doi.org/10.1038/s41597-023-02443-1

    Source code in openqdc/datasets/interaction/splinter.py
    class Splinter(BaseInteractionDataset):\n    \"\"\"\n    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated\n    by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies\n    and the various components are computed using SAPT0/qug-cc-pV(D=d)Z method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Splinter\n    dataset = Splinter()\n    ```\n\n    Reference:\n        https://doi.org/10.1038/s41597-023-02443-1\n    \"\"\"\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __name__ = \"splinter\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        # \"sapt0/jun-cc-pV(D+d)Z_unscaled\", #TODO: we need to pick the unscaled version only here\n        # \"sapt0/jun-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_unscaled\",\n        # 
\"sapt0/jun-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_scaled\",\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = []\n    __links__ = {\n        \"dimerpairs.0.tar.gz\": \"https://figshare.com/ndownloader/files/39449167\",\n        \"dimerpairs.1.tar.gz\": \"https://figshare.com/ndownloader/files/40271983\",\n        \"dimerpairs.2.tar.gz\": \"https://figshare.com/ndownloader/files/40271989\",\n        \"dimerpairs.3.tar.gz\": \"https://figshare.com/ndownloader/files/40272001\",\n        \"dimerpairs.4.tar.gz\": \"https://figshare.com/ndownloader/files/40272022\",\n        \"dimerpairs.5.tar.gz\": \"https://figshare.com/ndownloader/files/40552931\",\n        \"dimerpairs.6.tar.gz\": 
\"https://figshare.com/ndownloader/files/40272040\",\n        \"dimerpairs.7.tar.gz\": \"https://figshare.com/ndownloader/files/40272052\",\n        \"dimerpairs.8.tar.gz\": \"https://figshare.com/ndownloader/files/40272061\",\n        \"dimerpairs.9.tar.gz\": \"https://figshare.com/ndownloader/files/40272064\",\n        \"dimerpairs_nonstandard.tar.gz\": \"https://figshare.com/ndownloader/files/40272067\",\n        \"lig_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272070\",\n        \"lig_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272073\",\n        \"prot_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272076\",\n        \"prot_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272079\",\n        \"merge_monomers.py\": \"https://figshare.com/ndownloader/files/41807682\",\n    }\n\n    def read_raw_entries(self) -> List[Dict]:\n        logger.info(f\"Reading Splinter interaction data from {self.root}\")\n        data = []\n        i = 0\n        with tqdm(total=1680022) as progress_bar:\n            for root, dirs, files in os.walk(self.root):  # total is currently an approximation\n                for filename in files:\n                    if not filename.endswith(\".xyz\"):\n                        continue\n                    i += 1\n                    filepath = os.path.join(root, filename)\n                    filein = open(filepath, \"r\")\n                    lines = list(map(lambda x: x.strip(), filein.readlines()))\n                    n_atoms = np.array([int(lines[0])], dtype=np.int32)\n                    metadata = lines[1].split(\",\")\n                    try:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            r,\n           
                 theta_P,\n                            tau_P,\n                            theta_L,\n                            tau_L,\n                            tau_PL,\n                        ) = metadata[0].split(\"_\")\n                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(\n                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])\n                        )\n                    except ValueError:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            _,\n                        ) = metadata[0].split(\"_\")\n                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6\n                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)\n                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)\n                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))\n                    lines = list(map(lambda x: x.split(), lines[2:]))\n                    pos = np.array(lines)[:, 1:].astype(np.float32)\n                    elems = np.array(lines)[:, 0]\n                    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)\n                    natoms0 = n_atoms_ptr[0]\n                    natoms1 = n_atoms[0] - natoms0\n                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)\n                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)\n                    subset = np.array([root.split(\"/\")[-1]])\n\n                    item = dict(\n                        energies=energies,\n                        subset=subset,\n               
         n_atoms=n_atoms,\n                        n_atoms_ptr=n_atoms_ptr,\n                        atomic_inputs=atomic_inputs,\n                        protein_monomer_name=np.array([protein_monomer_name]),\n                        protein_interaction_site_type=np.array([protein_interaction_site_type]),\n                        ligand_monomer_name=np.array([ligand_monomer_name]),\n                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),\n                        index=np.array([index], dtype=np.float32),\n                        r=np.array([r], dtype=np.float32),\n                        theta_P=np.array([theta_P], dtype=np.float32),\n                        tau_P=np.array([tau_P], dtype=np.float32),\n                        theta_L=np.array([theta_L], dtype=np.float32),\n                        tau_L=np.array([tau_L], dtype=np.float32),\n                        tau_PL=np.array([tau_PL], dtype=np.float32),\n                        name=np.array([protein_monomer_name + \".\" + ligand_monomer_name]),\n                    )\n                    data.append(item)\n                    progress_bar.update(1)\n        logger.info(f\"Processed {i} files in total\")\n        return data\n
    "},{"location":"API/datasets/tmqm.html","title":"TMQM","text":""},{"location":"API/datasets/tmqm.html#openqdc.datasets.potential.tmqm.TMQM","title":"TMQM","text":"

    Bases: BaseDataset

    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database and then optimized in gas phase with the extended tight-binding GFN2-xTB method.

    Usage:

    from openqdc.datasets import TMQM\ndataset = TMQM()\n

    References

    https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041

    https://github.com/bbskjelstad/tmqm

    Source code in openqdc/datasets/potential/tmqm.py
    class TMQM(BaseDataset):\n    \"\"\"\n    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of\n    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated\n    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database\n    and then optimized in gas phase with the extended tight-binding GFN2-xTB method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import TMQM\n    dataset = TMQM()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\\n\n        https://github.com/bbskjelstad/tmqm\n    \"\"\"\n\n    __name__ = \"tmqm\"\n\n    __energy_methods__ = [PotentialMethod.TPSSH_DEF2_TZVP]  # \"tpssh/def2-tzvp\"]\n\n    energy_target_names = [\"TPSSh/def2TZVP level\"]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        x: f\"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}\"\n        for x in [\"tmQM_X1.xyz.gz\", \"tmQM_X2.xyz.gz\", \"tmQM_y.csv\", \"Benchmark2_TPSSh_Opt.xyz\"]\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"tmQM_y.csv\"), sep=\";\", usecols=[\"CSD_code\", \"Electronic_E\"])\n        e_map = dict(zip(df[\"CSD_code\"], df[\"Electronic_E\"]))\n        raw_fnames = [\"tmQM_X1.xyz\", \"tmQM_X2.xyz\", \"Benchmark2_TPSSh_Opt.xyz\"]\n        samples = []\n        for fname in raw_fnames:\n            data = read_xyz(p_join(self.root, fname), e_map)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/transition1x.html","title":"Transition1X","text":""},{"location":"API/datasets/transition1x.html#openqdc.datasets.potential.transition1x.Transition1X","title":"Transition1X","text":"

    Bases: BaseDataset

    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and the transition states are generated by running Nudged Elastic Band (NEB) with DFT.

    Usage:

    from openqdc.datasets import Transition1X\ndataset = Transition1X()\n

    References: - https://www.nature.com/articles/s41597-022-01870-w

    • https://gitlab.com/matschreiner/Transition1x
    Source code in openqdc/datasets/potential/transition1x.py
    class Transition1X(BaseDataset):\n    \"\"\"\n    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy\n    and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. The geometries and\n    the transition states are generated by running Nudged Elastic Band (NEB) with DFT.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Transition1X\n    dataset = Transition1X()\n    ```\n\n    References:\n    - https://www.nature.com/articles/s41597-022-01870-w\\n\n    - https://gitlab.com/matschreiner/Transition1x\\n\n    \"\"\"\n\n    __name__ = \"transition1x\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D\n        # \"wb97x/6-31G(d)\",\n    ]\n\n    energy_target_names = [\n        \"wB97x_6-31G(d).energy\",\n    ]\n\n    __force_mask__ = [True]\n    force_target_names = [\n        \"wB97x_6-31G(d).forces\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"Transition1x.h5\": \"https://figshare.com/ndownloader/files/36035789\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"Transition1x.h5\")\n        f = load_hdf5_file(raw_path)[\"data\"]\n\n        res = sum([read_record(f[g], group=g) for g in tqdm(f)], [])  # don't use parallelized here\n        return res\n
    "},{"location":"API/datasets/vqm24.html","title":"VQM24","text":""},{"location":"API/datasets/vqm24.html#openqdc.datasets.potential.vqm24.VQM24","title":"VQM24","text":"

    Bases: BaseDataset

    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical properties calculated at wB97x-D3/cc-pVDZ level of theory. This leads to 258,242 unique constitutional isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.

    Usage:

    from openqdc.datasets import VQM24\ndataset = VQM24()\n

    Reference

    https://arxiv.org/abs/2405.05961

    Source code in openqdc/datasets/potential/vqm24.py
    class VQM24(BaseDataset):\n    \"\"\"\n    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical\n    properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional\n    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and\n    relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.\n\n    Usage:\n    ```python\n    from openqdc.datasets import VQM24\n    dataset = VQM24()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2405.05961\n    \"\"\"\n\n    __name__ = \"vqm24\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_CC_PVDZ,  # \"wB97x-D3/cc-pVDZ.\"\n    ]\n\n    energy_target_names = [\n        \"wB97x-D3/cc-pVDZ\",\n    ]\n    # \u03c9B97X-D3/cc-pVDZ\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        f\"{name}.npz\": f\"https://zenodo.org/records/11164951/files/{name}.npz?download=1\"\n        for name in [\"DFT_all\", \"DFT_saddles\", \"DFT_uniques\", \"DMC\"]\n    }\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.append(read_npz_entry(raw_path))\n        return samples\n
    "},{"location":"API/datasets/waterclusters.html","title":"SCAN Waterclusters","text":""},{"location":"API/datasets/waterclusters.html#openqdc.datasets.potential.waterclusters.SCANWaterClusters","title":"SCANWaterClusters","text":"

    Bases: BaseDataset

    The SCAN Water Clusters dataset contains conformations of neutral water clusters containing up to 20 monomers, charged water clusters, and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212. Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics simulations using AMBER 9 and optimized to obtain the lowest-energy isomers, which were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.

    Chemical Species

    [H, O, Li, Na, K, F, Cl, Br]

    Usage:

    from openqdc.datasets import SCANWaterClusters\ndataset = SCANWaterClusters()\n

    References

    https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec

    https://github.com/esoteric-ephemera/water_cluster_density_errors

    Source code in openqdc/datasets/potential/waterclusters.py
    class SCANWaterClusters(BaseDataset):\n    \"\"\"\n    The SCAN Water Clusters dataset contains conformations of\n    neutral water clusters containing up to 20 monomers, charged water clusters,\n    and alkali- and halide-water clusters. This dataset consists of our data sets of water clusters:\n    the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14\n    neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of\n    ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212.\n    Water clusters were obtained from  10 nanosecond gas-phase molecular dynamics\n    simulations using AMBER 9 and optimized to obtain\n    lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.\n\n\n    Chemical Species:\n        [H, O, Li, Na, K, F, Cl, Br]\n\n    Usage:\n    ```python\n    from openqdc.datasets import SCANWaterClusters\n    dataset = SCANWaterClusters()\n    ```\n\n    References:\n        https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\\n\n        https://github.com/esoteric-ephemera/water_cluster_density_errors\n    \"\"\"\n\n    __name__ = \"scanwaterclusters\"\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    energy_target_names = [\n        \"HF\",\n        \"HF-r2SCAN-DC4\",\n        \"SCAN\",\n        \"SCAN@HF\",\n        \"SCAN@r2SCAN50\",\n        \"r2SCAN\",\n        \"r2SCAN@HF\",\n        \"r2SCAN@r2SCAN50\",\n        \"r2SCAN50\",\n        \"r2SCAN100\",\n        \"r2SCAN10\",\n        \"r2SCAN20\",\n        \"r2SCAN25\",\n        \"r2SCAN30\",\n        \"r2SCAN40\",\n        \"r2SCAN60\",\n        \"r2SCAN70\",\n        \"r2SCAN80\",\n        \"r2SCAN90\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]\n    force_target_names = []\n    # 27            # 9 
level\n    subsets = [\"BEGDB_H2O\", \"WATER27\", \"H2O_alkali_clusters\", \"H2O_halide_clusters\"]\n    __links__ = {\n        \"geometries.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True\",  # noqa\n        \"total_energies.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True\",  # noqa\n    }\n\n    def read_raw_entries(self):\n        entries = []  # noqa\n        for i, subset in enumerate(self.subsets):\n            geometries = read_geometries(p_join(self.root, \"geometries.json.gz\"), subset)\n            energies = read_energies(p_join(self.root, \"total_energies.json.gz\"), subset)\n            datum = {}\n            for k in energies:\n                _ = energies[k].pop(\"metadata\")\n                datum[k] = energies[k][\"total_energies\"]\n            entries.extend(format_geometry_and_entries(geometries, datum, subset))\n        return entries\n
    "},{"location":"API/datasets/waterclusters3_30.html","title":"Waterclusters3_30","text":""},{"location":"API/datasets/waterclusters3_30.html#openqdc.datasets.potential.waterclusters3_30.WaterClusters","title":"WaterClusters","text":"

    Bases: BaseDataset

    The WaterClusters dataset contains putative minima and low energy networks for water clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. It contains approximately 4.5 million structures. Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.

    Chemical Species

    [\"H\", \"O\"]

    Usage:

    from openqdc.datasets import WaterClusters\ndataset = WaterClusters()\n

    References

    https://doi.org/10.1063/1.5128378

    https://sites.uw.edu/wdbase/database-of-water-clusters/

    Source code in openqdc/datasets/potential/waterclusters3_30.py
    class WaterClusters(BaseDataset):\n    \"\"\"\n    The WaterClusters dataset contains putative minima and low energy networks for water\n    clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with\n    the TTM2.1-F ab-initio based interaction potential for water.\n    It contains approximately 4.5 mil. structures.\n    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.\n\n    Chemical Species:\n        [\"H\", \"O\"]\n\n    Usage:\n    ```python\n    from openqdc.datasets import WaterClusters\n    dataset = WaterClusters()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5128378\\n\n        https://sites.uw.edu/wdbase/database-of-water-clusters/\\n\n    \"\"\"\n\n    __name__ = \"waterclusters3_30\"\n\n    # Energy in hartree, all zeros by default\n    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [PotentialMethod.TTM2_1_F]  # \"ttm2.1-f\"\n    energy_target_names = [\"TTM2.1-F Potential\"]\n    __links__ = {\"W3-W30_all_geoms_TTM2.1-F.zip\": \"https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n\"}\n\n    def read_raw_entries(self):\n        samples = []\n        parent_folder = p_join(self.root, \"W3-W30_all_geoms_TTM2.1-F/\")\n        for i in range(3, 31):\n            name = f\"W{i}_geoms_all\"\n            zip_path = p_join(parent_folder, f\"{name}.zip\")\n            xyz_path = p_join(parent_folder, f\"{name}.xyz\")\n            with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n                zip_ref.extractall(parent_folder)\n\n            data = read_xyz(xyz_path, i)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/x40.html","title":"X40","text":""},{"location":"API/datasets/x40.html#openqdc.datasets.interaction.x40.X40","title":"X40","text":"

    Bases: YamlDataset

    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules where the halogens participate in various interaction types such as electrostatic interactions, London dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic molecules. For each complex, 10 geometries are generated, resulting in 400 geometries in the dataset. The geometries are optimized using the MP2 level of theory with cc-pVTZ basis set, whereas the interaction energies are computed with CCSD(T)/CBS level of theory.

    Usage:

    from openqdc.datasets import X40\ndataset = X40()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct300647k

    Source code in openqdc/datasets/interaction/x40.py
    class X40(YamlDataset):\n    \"\"\"\n    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules\n    where the halogens participate in various interaction types such as electrostatic interactions, london\n    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic\n    molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries\n    are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are\n    computed with CCSD(T)/CBS level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import X40\n    dataset = X40()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct300647k\n    \"\"\"\n\n    __name__ = \"x40\"\n    __energy_methods__ = [\n        InteractionMethod.CCSD_T_CBS,  # \"CCSD(T)/CBS\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.DCCSDT_HA_DZ,  # \"dCCSD(T)/haDZ\",\n        InteractionMethod.DCCSDT_HA_TZ,  # \"dCCSD(T)/haTZ\",\n        InteractionMethod.MP2_5_CBS_ADZ,  # \"MP2.5/CBS(aDZ)\",\n    ]\n    __links__ = {\n        \"x40.yaml\": \"http://cuby4.molecular.cz/download_datasets/x40.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/X40.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.shortname\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        xyz_path = p_join(root, f\"{filename}.xyz\")\n        with open(xyz_path, \"r\") as xyz_file:  # avoid not closing the file\n            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))\n            setup = lines.pop(1)\n            n_atoms_first = setup[0].split(\"-\")[1]\n            n_atoms_ptr = np.array([int(n_atoms_first)], dtype=np.int32)\n            return n_atoms_ptr\n
    "},{"location":"tutorials/usage.html","title":"OpenQDC Hands-on Tutorial","text":"In\u00a0[31]: Copied!
    from openqdc.datasets import Spice\nds = Spice(\n    energy_unit=\"kcal/mol\",\n    distance_unit=\"ang\",\n)\n
    from openqdc.datasets import Spice ds = Spice( energy_unit=\"kcal/mol\", distance_unit=\"ang\", )
    2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:381 - Reading preprocessed data.\n2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:382 - Dataset spice with the following units:\n                     Energy: hartree,\n                     Distance: bohr,\n                     Forces: hartree/bohr\n2024-02-29 12:17:13.978 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded atomic_inputs with shape (33175288, 5), dtype float32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded position_idx_range with shape (1110165, 2), dtype int32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded energies with shape (1110165, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded forces with shape (33175288, 3, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded name with shape (1110165,), dtype <U632\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded subset with shape (1110165,), dtype <U20\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded n_atoms with shape (1110165,), dtype int32\n2024-02-29 12:17:13.983 | INFO     | openqdc.datasets.base:_precompute_statistics:154 - Loaded precomputed statistics\n2024-02-29 12:17:13.985 | INFO     | openqdc.datasets.base:_convert_data:141 - Converting spice data to the following units:\n                     Energy: kcal/mol,\n                     Distance: ang,\n                     Forces: kcal/mol/ang\n
    In\u00a0[39]: Copied!
    ds[0]\n
    ds[0] Out[39]:
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n        [ 0.06135919,  2.6528177 , -0.4163168 ],\n        [ 1.762424  ,  1.0939031 , -1.4321265 ],\n        [-0.22598556,  1.6802124 ,  0.5978407 ],\n        [ 1.1740401 , -0.04154727, -0.512898  ],\n        [-0.41957757, -0.24454471,  3.0900123 ],\n        [ 0.7238282 ,  0.52511275,  0.8248042 ],\n        [ 0.05533566, -0.6713925 ,  1.6488242 ],\n        [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n        [-0.0657557 ,  1.8550861 , -2.3939755 ],\n        [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n        [-0.8098082 ,  3.201651  , -0.6507186 ],\n        [ 0.792407  ,  3.368585  ,  0.01799216],\n        [ 2.558414  ,  1.5826052 , -0.9704587 ],\n        [ 2.166226  ,  0.64460325, -2.384977  ],\n        [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n        [-1.1792994 ,  1.1978384 ,  0.34465855],\n        [ 1.8563557 , -0.90775317, -0.5115611 ],\n        [ 0.31435642, -0.42179283, -1.0628686 ],\n        [ 0.42152542,  0.25200853,  3.627957  ],\n        [-0.5416419 , -1.1152233 ,  3.7040234 ],\n        [-1.1868238 ,  0.46580845,  3.0541756 ],\n        [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n        [-0.7720179 , -0.9603249 ,  0.994841  ],\n        [ 1.7518724 , -1.5571898 ,  2.560223  ],\n        [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n        [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32),\n 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n        1, 1, 1, 1, 1], dtype=int32),\n 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0, 0, 0, 0], dtype=int32),\n 'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  
-312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526 
 ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[40]: Copied!
    ds.get_ase_atoms(0)\n
    ds.get_ase_atoms(0) Out[40]:
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)
    In\u00a0[53]: Copied!
    ds.get_ase_atoms(0).info\n
    ds.get_ase_atoms(0).info Out[53]:
    {'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 
16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[41]: Copied!
    for i in ds.as_iter():\n    print(i)\n    break\n
    for i in ds.as_iter(): print(i) break
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n       [ 0.06135919,  2.6528177 , -0.4163168 ],\n       [ 1.762424  ,  1.0939031 , -1.4321265 ],\n       [-0.22598556,  1.6802124 ,  0.5978407 ],\n       [ 1.1740401 , -0.04154727, -0.512898  ],\n       [-0.41957757, -0.24454471,  3.0900123 ],\n       [ 0.7238282 ,  0.52511275,  0.8248042 ],\n       [ 0.05533566, -0.6713925 ,  1.6488242 ],\n       [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n       [-0.0657557 ,  1.8550861 , -2.3939755 ],\n       [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n       [-0.8098082 ,  3.201651  , -0.6507186 ],\n       [ 0.792407  ,  3.368585  ,  0.01799216],\n       [ 2.558414  ,  1.5826052 , -0.9704587 ],\n       [ 2.166226  ,  0.64460325, -2.384977  ],\n       [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n       [-1.1792994 ,  1.1978384 ,  0.34465855],\n       [ 1.8563557 , -0.90775317, -0.5115611 ],\n       [ 0.31435642, -0.42179283, -1.0628686 ],\n       [ 0.42152542,  0.25200853,  3.627957  ],\n       [-0.5416419 , -1.1152233 ,  3.7040234 ],\n       [-1.1868238 ,  0.46580845,  3.0541756 ],\n       [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n       [-0.7720179 , -0.9603249 ,  0.994841  ],\n       [ 1.7518724 , -1.5571898 ,  2.560223  ],\n       [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n       [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32), 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1], dtype=int32), 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0], dtype=int32), 'e0': array([[-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-33939.41501837],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       
[  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ]]), 'energies': array([-232450.64], dtype=float32), 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]', 'subset': 'PubChem', 'forces': array([[[  2.1335483 ],\n        [-37.241825  ],\n        [ 22.830988  ]],\n\n       [[ 68.235725  ],\n        [ 59.30573   ],\n        [-27.672606  ]],\n\n       [[-34.137283  ],\n        [-30.504696  ],\n        [-33.670048  ]],\n\n       [[-49.57814   ],\n        [-75.2747    ],\n        [ 32.80194   ]],\n\n       [[  8.196513  ],\n        [ 17.132149  ],\n        [-36.84995   ]],\n\n       [[ 67.39872   ],\n        [ -8.923976  ],\n        [-20.772083  ]],\n\n       [[ 45.424217  ],\n        [-33.559574  ],\n        [ 20.30243   ]],\n\n       [[-13.522426  ],\n        [ 79.690094  ],\n        [ 15.531546  ]],\n\n       [[ 35.77895   ],\n        [  1.9324436 ],\n        [ -8.205132  ]],\n\n       [[ -3.3487453 ],\n        [ -7.991125  ],\n        [ -9.71156   ]],\n\n       [[  1.4049193 ],\n        [ 13.497365  ],\n        [ -5.981079  ]],\n\n       [[-21.196207  ],\n        [ 16.861713  ],\n        [ -1.7730864 ]],\n\n       [[-10.805695  ],\n        [ -2.033095  ],\n        [ -4.2524548 ]],\n\n       [[ 35.204765  ],\n        [ 12.971134  ],\n        [ 22.815577  ]],\n\n       [[-11.87403   ],\n        [ 10.404548  ],\n        [ 23.009806  ]],\n\n       [[  2.3782759 ],\n        [ 19.309696  ],\n        [ 15.546526  ]],\n\n       [[ -2.5732849 ],\n        [ -4.098344  ],\n        [ -5.087256  ]],\n\n       [[  3.5987573 ],\n        [ 10.469024  
],\n        [  9.869113  ]],\n\n       [[ -8.646548  ],\n        [ -0.35554707],\n        [  1.7650104 ]],\n\n       [[ -6.6712875 ],\n        [ -0.7742697 ],\n        [-15.672442  ]],\n\n       [[-25.453985  ],\n        [ -9.350726  ],\n        [  6.0056353 ]],\n\n       [[-32.657543  ],\n        [ 10.617167  ],\n        [  2.516469  ]],\n\n       [[-23.541552  ],\n        [ -9.305013  ],\n        [ -9.855984  ]],\n\n       [[  2.8105662 ],\n        [-13.78966   ],\n        [ 10.141727  ]],\n\n       [[-29.951014  ],\n        [ -9.25683   ],\n        [-23.69946   ]],\n\n       [[ -3.412568  ],\n        [  4.13157   ],\n        [ 12.421117  ]],\n\n       [[  4.77353   ],\n        [-13.841051  ],\n        [  7.6428723 ]]], dtype=float32)}\n
    In\u00a0[42]: Copied!
    for i in ds.as_iter(atoms=True):\n    print(i)\n    break\n
    for i in ds.as_iter(atoms=True): print(i) break
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)\n
    In\u00a0[43]: Copied!
    from openqdc.methods import QmMethod\n\n# Get the b3lyp/6-31g* method\nmethod = QmMethod.B3LYP_6_31G_D\nmethod.atom_energies_dict\n
    from openqdc.methods import QmMethod # Get the b3lyp/6-31g* method method = QmMethod.B3LYP_6_31G_D method.atom_energies_dict Out[43]:
    {('H', -1): -0.4618190740256503,\n ('H', 0): -0.5002733301377901,\n ('H', 1): 0.0,\n ('Li', 1): -7.284546111273075,\n ('B', -3): -23.577268753399462,\n ('B', -1): -24.614577395156598,\n ('B', 0): -24.65435524492553,\n ('B', 3): -22.018169862974275,\n ('C', -1): -37.844269871879376,\n ('C', 0): -37.84628033285479,\n ('C', 1): -37.42731164237431,\n ('N', -1): -54.52864356359092,\n ('N', 0): -54.584488815424095,\n ('N', 1): -54.0458621835885,\n ('O', -1): -75.05272792994404,\n ('O', 0): -75.06062109946738,\n ('O', 1): -74.54659271939704,\n ('F', -1): -99.75408410035712,\n ('F', 0): -99.71553471526475,\n ('Na', 1): -162.081235395777,\n ('Mg', 2): -199.22734695613283,\n ('Si', 4): -285.5564410277949,\n ('Si', 0): -289.3717359984153,\n ('Si', -4): -288.02795351148654,\n ('P', 0): -341.2580911838578,\n ('P', 1): -340.8765976669208,\n ('S', -1): -398.16568433994024,\n ('S', 0): -398.1049932797066,\n ('S', 1): -397.7199808615457,\n ('Cl', -2): -459.5066184980746,\n ('Cl', -1): -460.25223446009306,\n ('Cl', 0): -460.13624346967765,\n ('Cl', 2): -458.6740467177361,\n ('K', 1): -599.7247062673807,\n ('Ca', 2): -676.8667395990246,\n ('Br', -1): -2573.824201570383,\n ('Br', 0): -2573.705283744811,\n ('I', -1): None,\n ('I', 0): None}
    In\u00a0[44]: Copied!
    # Get the matrix of atomization energies for the b3lyp/6-31g* method\nmethod.atom_energies_matrix\n
    # Get the matrix of atomization energies for the b3lyp/6-31g* method method.atom_energies_matrix Out[44]:
    array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])
    In\u00a0[45]: Copied!
    import matplotlib.pyplot as plt \nfrom sklearn.decomposition import PCA\ndatum = ds.soap_descriptors(n_samples=500, progress=True)\nreducer = PCA()\nembedding = reducer.fit_transform(datum[\"soap\"])\n
    import matplotlib.pyplot as plt from sklearn.decomposition import PCA datum = ds.soap_descriptors(n_samples=500, progress=True) reducer = PCA() embedding = reducer.fit_transform(datum[\"soap\"])
    100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 500/500 [00:01<00:00, 459.21it/s]\n
    In\u00a0[46]: Copied!
    plt.scatter(\n    embedding[:, 0],\n    embedding[:, 1],\n    c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\nplt.colorbar()\n
    plt.scatter( embedding[:, 0], embedding[:, 1], c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]]) plt.colorbar() Out[46]:
    <matplotlib.colorbar.Colorbar at 0x1554aa7bd820>
    "},{"location":"tutorials/usage.html#openqdc-hands-on-tutorial","title":"OpenQDC Hands-on Tutorial\u00b6","text":""},{"location":"tutorials/usage.html#instantiate-and-go","title":"Instantiate and GO!\u00b6","text":"

    If you don't have the dataset downloaded, it will be downloaded automatically and cached. You just instantiate the class and you are ready to go. Change of units is done automatically upon loading based on the units of the dataset.

    Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    Supported distance units: [\"ang\", \"nm\", \"bohr\"]

    "},{"location":"tutorials/usage.html#items-from-the-dataset-object-class-are-obtained-through-the-get-method","title":"Items from the dataset object class are obtained through the \"get\" method.\u00b6","text":"

    The dictionary of the item contains different important keys:

    • 'positions' : numpy array of the 3d atomic positions (n x 3)
    • 'atomic_numbers': numpy array of the atomic numbers (n)
    • 'charges': numpy array of the formal charges for the molecule (n)
    • 'e0': isolated atom energy of the atoms in the molecule (n x n_level_of_theories)
    • 'energies': potential energy of the molecule (n_level_of_theories)
    • 'name': name or SMILES (if present) of the molecule
    • 'subset': subset of the dataset the molecule belongs to
    • 'forces': if present, the forces on the atoms (n x 3 x n_level_of_theories_forces)
    "},{"location":"tutorials/usage.html#alternatively-we-can-also-retrieve-the-data-from-the-dataset-object-class-as-aseatoms-using-the-get_ase_atoms","title":"Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!\u00b6","text":""},{"location":"tutorials/usage.html#iterators","title":"Iterators\u00b6","text":"

    The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns each item as an ase.Atoms object. Otherwise, it returns the dictionary of the item.

    "},{"location":"tutorials/usage.html#isolated-atoms-energies-e0s","title":"Isolated atoms energies [e0s]\u00b6","text":"

    The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.

    $U(A_1, A_2, ...) = \\sum_{i=1}^N e_0(A_i) + e(A_1, A_2, ...)$

    The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:

    "},{"location":"tutorials/usage.html#chemical-space-from-soap-descriptors","title":"Chemical space from SOAP descriptors\u00b6","text":"

    openQDC offers a simple way to calculate the Smooth Overlap of Atomic Positions (SOAP) descriptors for the molecules in the dataset. The method soap_descriptors returns the SOAP descriptors for the molecules in the dataset.

    "}]} \ No newline at end of file diff --git a/stable/sitemap.xml b/stable/sitemap.xml index f80f24a..c75575f 100644 --- a/stable/sitemap.xml +++ b/stable/sitemap.xml @@ -2,237 +2,252 @@ https://github.com/valence-labs/openQDC/stable/index.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/cli.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/contribute.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/data_storage.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/dataset_upload.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/datasets.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/licensing.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/normalization_e0s.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/usage.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/basedataset.html - 2024-07-23 + 2024-07-24 + daily + + + https://github.com/valence-labs/openQDC/stable/API/e0_dispatcher.html + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/formats.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/methods.html - 2024-07-23 + 2024-07-24 + daily + + + https://github.com/valence-labs/openQDC/stable/API/properties.html + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/regressor.html - 2024-07-23 + 2024-07-24 + daily + + + https://github.com/valence-labs/openQDC/stable/API/statistics.html + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/units.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/utils.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/alchemy.html - 2024-07-23 + 2024-07-24 
daily https://github.com/valence-labs/openQDC/stable/API/datasets/ani.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/comp6.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/des.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/gdml.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/geom.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/iso_17.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/l7.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/md22.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/metcalf.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/molecule3d.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/multixcqm9.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/nabladft.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/orbnet_denali.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/pcqm.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/proteinfragments.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/qm1b.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/qm7x.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/qmugs.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/qmx.html - 2024-07-23 + 2024-07-24 daily 
https://github.com/valence-labs/openQDC/stable/API/datasets/revmd17.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/sn2_rxn.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/solvated_peptides.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/spice.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/splinter.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/tmqm.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/transition1x.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/vqm24.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/waterclusters.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/waterclusters3_30.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/API/datasets/x40.html - 2024-07-23 + 2024-07-24 daily https://github.com/valence-labs/openQDC/stable/tutorials/usage.html - 2024-07-23 + 2024-07-24 daily \ No newline at end of file diff --git a/stable/sitemap.xml.gz b/stable/sitemap.xml.gz index 29f8acc2e16cad06b228548ad9ff3ec48d207d7f..540ca79864239af8781d6bd393093ccd5391aeee 100644 GIT binary patch literal 581 zcmV-L0=oSliwFn+M4)B@|8r?{Wo=<_E_iKh0Nt5Ui`y^^$KUT$l)kr4)3WYH+hc4D z_O`+H92{Fs1hSo4O559~pL5q8{QzU2<;9M0l;7XTiX88ru7PG}$c!oO=c{EkpFwoV zW9;_xpFh4V-p`*754$lFgUrg{$oqLOGJG)Ra=9!S1%e%TnQ~{?_2q`m&1kCSIYy^`;ED+3tq9-o_4&8BPapk)C(v&hmUYWQ9NT0ku?jaXr76g6h8M+M2Nnenct{@9KeUm;$ zRJb}vLDEP4r^qsvrj6CJVgV8L!k59rJH7ZG6i0sN11YA56!g*~r)IU)%TJ2ggRg7F z^dQdCQL%tT;6{SASFB+q!K*ukN4*+iFiwFpS@}6b_|8r?{Wo=<_E_iKh0Nt5Ui`y^^$KUT$l)N`i)3WYHn`3MY z_O`+H9GqB946>bCa+}+ypL5rZet=`3`NfG)l;7XTih>VMv%}Q|BH7@N#ip!^75F|F 
z>xX0U^T(I<`{Jp2IHZUKyvoj=j>X74eNgIpy_RGdoSvu*aZuEc`G(ZZsH*a@XdYI1 zECF7xUd91K-z-rmhZ!u7sVn>7)J0={JFHQ6q{0OL`{z%JSa%4@dINJgvcrQ~9>0&K ziCr*FUDdmFwO`lU>~gc+x%H#=1Dqn98>1~gAJm=Y6;u7?d;{Vhtt{T)Iod8Uh{86S zbxRzg9^`@NUDc)ugEkV;Nfb2N9z5mBCW_jJ=rr12nsbS6p^~-ciKOkykKewkE+_Xb z#l^9omV6PmGZ$_LJO(3=JHilwC`57O$yOUV$rEzJJp9Kr>RxW&Yp5DpD zHy}9k2jfsMJp?D0-co2cd%655nBAGW7EBM^Ydi@S5D@x=ur-1;BxF1HXZI*qgU=>y zPApeJh@A&+4S0=$d!Q-jkzo257_2` + + + + + + + + +
    + +
  • + + + + + + + + + +
  • diff --git a/stable/usage.html b/stable/usage.html index 8e4904b..a902358 100644 --- a/stable/usage.html +++ b/stable/usage.html @@ -779,12 +779,50 @@ + + + + + + + +
  • + + + + + + + + + + +
  • + + + + + + + + + +
  • @@ -1855,6 +1964,11 @@

    Iteratorsprint(data) # Atoms object break

  • +

    or if you want to just iterate over the data:

    +
    for data in dataset:
    +    print(data) # dict of arrays
    +    break
    +

    Lazy loading

    OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during import openqdc as qdc. In case of trouble you can always disable lazy loading by setting the environment variable OPENQDC_DISABLE_LAZY_LOADING to 1.