diff --git a/callbacks/text_mining/util.py b/callbacks/text_mining/util.py index 86c6c0cd..7594141e 100644 --- a/callbacks/text_mining/util.py +++ b/callbacks/text_mining/util.py @@ -11,6 +11,7 @@ COLNAMES = ['Gene', 'PMID', 'Title', 'Sentence', 'Score'] SIMILARITY_CUTOFF = 85 +MAX_NUM_RESULTS = 100 def sanitize_text(text): @@ -100,6 +101,7 @@ def text_mining_query_search(query_string): df = pd.DataFrame(columns=COLNAMES) pubmed_matches = set() + pubmed_matches_100 = set() with open(Constants.TEXT_MINING_ANNOTATED_ABSTRACTS, 'r', encoding='utf8') as f: for line in f: @@ -176,6 +178,12 @@ def text_mining_query_search(query_string): df.loc[len(df.index)] = [Entity, PMID, Title, Sentence, similarity.score] + if similarity.score == 100: + pubmed_matches_100.add(PMID) + + if len(pubmed_matches_100) == MAX_NUM_RESULTS: + break + except: pass