diff --git a/chemical/molecular_search/README.md b/chemical/molecular_search/README.md index 8f337e6..fcbd0ec 100644 --- a/chemical/molecular_search/README.md +++ b/chemical/molecular_search/README.md @@ -1,12 +1,10 @@ # Molecular Search -This molecular search example mainly consists of two notebooks, and two python files about "how to load large molecular formula" and "start an online service". +Drug discovery, as a source of medical innovation, is an important part of new medicine research and development. It is carried out through target selection and confirmation. In order to discover usable compounds in the fragment space of billion-scale compound libraries, chemical fingerprints are usually retrieved for substructure search and similarity search. -I think everyone can learn the basic operations of Molecular Search System through the [**getting started notebook**](./1_build_molecular_search_engine.ipynb). And the [**deep dive notebook**](./2_deep_dive_molecular_search.ipynb) will show you how to deploy the service. - -[**load.py**](./load.py) is used to import your large-scale data, and [**server.py**](./server.py) will start a FastAPI-based service. +This example shows you how to search for molecular formulas that are similar to, substructures of, or superstructures of a given molecule. It mainly consists of two notebooks: you can learn the basic operations of the Molecular Search System through the [**getting started notebook**](./1_build_molecular_search_engine.ipynb), and the [**deep dive notebook**](./2_deep_dive_molecular_search.ipynb) will show you how to deploy the service. ## Learn from Notebook @@ -17,42 +15,3 @@ In this notebook you will get the prerequisites, how to complete a simple molecu - [Deep Dive](./2_deep_dive_molecular_search.ipynb) In this notebook you will learn how to improve system stability, and finally show you how to start the FastAPI service. 
- -## Load Large-scale Data - -I think you already know from previous notebooks that a very important step in molecular search is loading the data. If you have large-scale data, you can try running with `exception_safe` in [load.py](./load.py), which make the import process safer. - -> You can load your own data in this script. - -```bash -$ python load.py -Collection number: 10000 -``` - -## Deploy with FastAPI - -After the data is loaded, you can start the search service for molecular search, and also support inserting data services. - -```bash -$ python server.py -INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) -``` - -Next you can test the service with the following command. - -```bash -# search the similar molecular -$ curl -X POST "http://0.0.0.0:8000/similarity" --data "Cn1ccc(=O)nc1" - -# search the superstructure molecular -$ curl -X POST "http://0.0.0.0:8000/superstructure" --data "Cn1ccc(=O)nc1" - -# search the substructure molecular -$ curl -X POST "http://0.0.0.0:8000/substructure" --data "Cn1ccc(=O)nc1" - -# insert a molecular -$ curl -X POST "http://0.0.0.0:8000/insert" --data "Cn1ccc(=O)nc1" - -# count the collection -$ curl -X POST "http://0.0.0.0:8000/count" -``` diff --git a/chemical/molecular_search/load.py b/chemical/molecular_search/load.py deleted file mode 100644 index 0d076d8..0000000 --- a/chemical/molecular_search/load.py +++ /dev/null @@ -1,39 +0,0 @@ -import towhee -from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility - - -def create_milvus_collection(collection_name, dim): - connections.connect(host='127.0.0.1', port='19530') - - if utility.has_collection(collection_name): - utility.drop_collection(collection_name) - - fields = [ - FieldSchema(name='id', dtype=DataType.INT64, descrition='ids', is_primary=True, auto_id=False), - FieldSchema(name='embedding', dtype=DataType.BINARY_VECTOR, descrition='embedding vectors', dim=dim) - ] - schema = CollectionSchema(fields=fields, 
description='molecular similarity search') - collection = Collection(name=collection_name, schema=schema) - - return collection - - -def main(): - collection_name = 'molecular_search' - csv_file = 'pubchem_10000.smi' - algorithm = 'daylight' - milvus_collection = create_milvus_collection(collection_name, 2048) - connections.connect(host='127.0.0.1', port='19530') - - (towhee.read_csv(csv_file) - .exception_safe() - .runas_op['id', 'id'](func=lambda x: int(x)) - .molecular_fingerprinting['smiles', 'fp'](algorithm=algorithm) - .drop_empty() - .to_milvus['id', 'fp'](collection=milvus_collection, batch=100) - ) - print('Collection number: ', milvus_collection.num_entities) - - -if __name__ == '__main__': - main() diff --git a/chemical/molecular_search/server.py b/chemical/molecular_search/server.py deleted file mode 100644 index f2517b3..0000000 --- a/chemical/molecular_search/server.py +++ /dev/null @@ -1,90 +0,0 @@ -import towhee -import pandas as pd -import time -import uvicorn -from fastapi import FastAPI -from pymilvus import connections, Collection - -app = FastAPI() - -collection_name = 'molecular_search' -csv_file = 'pubchem_10000.smi' -algorithm = 'daylight' - -connections.connect(host='127.0.0.1', port='19530') -milvus_collection = Collection(collection_name) - -df = pd.read_csv(csv_file) -id_smiles = df.set_index('id')['smiles'].to_dict() - - -@towhee.register(name='get_smiles_id') -def get_smiles_id(smiles): - timestamp = int(time.time()*10000) - id_smiles[timestamp] = smiles - return timestamp - - -@towhee.register(name='milvus_insert') -class MilvusInsert: - def __init__(self, collection): - self.collection = collection - - def __call__(self, *args, **kwargs): - data = [] - for iterable in args: - data.append([iterable]) - mr = self.collection.insert(data) - self.collection.load() - return str(mr) - - -with towhee.api['smiles']() as api: - app_insert = ( - api.get_smiles_id['smiles', 'id']() - .molecular_fingerprinting['smiles', 
'fp'](algorithm='daylight') - .milvus_insert[('id', 'fp'), 'res'](collection=milvus_collection) - .select['id', 'res']() - .serve('/insert', app) - ) - - -with towhee.api['smiles']() as api: - app_search = ( - api.molecular_fingerprinting['smiles', 'fp'](algorithm='daylight') - .milvus_search['fp', 'result'](collection=milvus_collection, metric_type='JACCARD') - .runas_op['result', 'similar_smile'](func=lambda res: [id_smiles[x.id] for x in res]) - .select['smiles', 'similar_smile']() - .serve('/similarity', app) - ) - - -with towhee.api['smiles']() as api: - app_search = ( - api.molecular_fingerprinting['smiles', 'fp'](algorithm='daylight') - .milvus_search['fp', 'result'](collection=milvus_collection, metric_type='SUPERSTRUCTURE') - .runas_op['result', 'superstructure'](func=lambda res: [id_smiles[x.id] for x in res]) - .select['smiles', 'superstructure']() - .serve('/superstructure', app) - ) - - -with towhee.api['smiles']() as api: - app_search = ( - api.molecular_fingerprinting['smiles', 'fp'](algorithm='daylight') - .milvus_search['fp', 'result'](collection=milvus_collection, metric_type='SUBSTRUCTURE') - .runas_op['result', 'substructure'](func=lambda res: [id_smiles[x.id] for x in res]) - .select['smiles', 'substructure']() - .serve('/substructure', app) - ) - - -with towhee.api() as api: - app_count = ( - api.map(lambda _: milvus_collection.num_entities) - .serve('/count', app) - ) - - -if __name__ == '__main__': - uvicorn.run(app=app, host='0.0.0.0', port=8000)