From a8b7c0c6c4363adfadaf486d1847f9793c5a07d5 Mon Sep 17 00:00:00 2001 From: louyk18 <14280048+louyk18@user.noreply.gitee.com> Date: Mon, 13 Jan 2025 17:19:13 +0800 Subject: [PATCH] improve performance --- python/graphy/utils/arxiv_fetcher.py | 33 ++++++---------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/python/graphy/utils/arxiv_fetcher.py b/python/graphy/utils/arxiv_fetcher.py index 6a07c192..0f342464 100644 --- a/python/graphy/utils/arxiv_fetcher.py +++ b/python/graphy/utils/arxiv_fetcher.py @@ -81,7 +81,7 @@ def __init__( def find_paper_from_arxiv(self, name, max_results): new_names = sorted( - [s for s in re.split(r"[.\\/]", name.strip()) if len(s) >= 20], + [s.strip() for s in re.split(r"[.\\/]", name.strip()) if len(s) >= 20], key=len, reverse=True, ) @@ -132,7 +132,7 @@ def find_paper_from_arxiv(self, name, max_results): except Exception as e: traceback.print_exc() - if highest_similarity > 0.9 or found_result: + if highest_similarity > 0.9: break logger.warning(f"Not Found: {query}") @@ -230,27 +230,8 @@ def fetch_papers_concurrently( if __name__ == "__main__": - filenames = [] - - # Traverse the directory - for root, dirs, files in os.walk("inputs/download"): - for file in files: - # Append file names to the list - filenames.append(file) - - for file_name in filenames: - download_foler = os.path.join(f"{WF_DOWNLOADS_DIR}", file_name.split(".")[0]) - fetcher = ArxivFetcher(download_folder=download_foler) - os.makedirs(download_foler) - path = os.path.join("inputs", "download", file_name) - with open(f"{path}", "r") as f: - papers = f.readlines() - line_counter = 0 - for paper in papers: - line_counter += 1 - if line_counter % 2 == 0: - continue - paper = paper.strip() - fetcher.download_paper(paper, 5) - # fetcher.download_paper(paper.split(",")[0], 5) - # print(json.dumps(data, indent=2)) + af = ArxivFetcher() + output = af.find_paper_from_arxiv( + "Peiyi Wang, Lei Li, Zhihong Shao, RX Xu, Damai Dai, Yifei Li, Deli Chen, Y Wu, and Zhifang Sui. Math-shepherd: Verify and reinforce llms step-by-step without human annotations. CoRR, abs/2312.08935 , 2023a." + ) + print(output)