Skip to content

Commit

Permalink
Merge pull request #674 from GraphScope/test_arxiv
Browse files Browse the repository at this point in the history
Extract arxiv ids from references to get more cited papers
  • Loading branch information
Louyk14 authored Jan 14, 2025
2 parents 4e24433 + 840301d commit 72fb94e
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
26 changes: 26 additions & 0 deletions python/graphy/extractor/paper_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def __init__(
self.fake_extractor = False
self.linked_contents = set()
self.input_meta_data = {}
self.reference_page = -1

try:
if meta_path:
Expand Down Expand Up @@ -598,6 +599,29 @@ def _match_type(self, keywords: List[str], string: str, index: str):
return True
return False

def _extract_arxiv_link(self, page_num):
if self.reference_page > -1 and page_num < self.reference_page:
return

page_text = self.page_texts[page_num]
for block in page_text["blocks"]:
if block["type"] == 1:
continue
for line in block["lines"]:
succ, line_text = self._get_line_text(line)
if succ:
line_text = line_text.lower().strip()
if self.reference_page > -1:
pattern = r"(arXiv[.:]\d{4}\.\d{5}|abs/\d{4}\.\d{5})"
matches = re.findall(pattern, line_text)
for match in matches:
self.linked_contents.add(match)
else:
if line_text == "references" or line_text == "reference":
self.reference_page = page_num
else:
continue

def _extract_link(self, page, page_num):
"""Extract links from a given PDF page and retrieve the associated text."""
page_width = page.rect.width
Expand Down Expand Up @@ -883,6 +907,8 @@ def _extract_link(self, page, page_num):

logger.debug(f"Extracted {len(link_info)} links")

self._extract_arxiv_link(page_num)

return link_info

def _check_possible_section_title(
Expand Down
31 changes: 31 additions & 0 deletions python/graphy/utils/arxiv_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,34 @@ def __init__(

self.result_former = ResultFormer()

def find_paper_with_arxiv_id(self, name):
    """Look up a paper on arXiv by an id embedded in *name*.

    Extracts ids of the form ``arXiv:YYMM.NNNNN`` / ``abs/YYMM.NNNN``
    (case-insensitive, optional ``vN`` version suffix) and queries the
    arXiv API for them.

    Args:
        name: A reference string that may contain an arXiv identifier.

    Returns:
        A ``(similarity, paper)`` tuple. ``similarity`` is 1 when a paper
        was retrieved by id (an id lookup is authoritative), otherwise
        0.0; ``paper`` is the best-matching result or ``None``.
    """
    name = name.strip()
    # Ids harvested from PDF text may have been lowercased, so match
    # case-insensitively; pre-2015 arXiv ids have 4 digits after the
    # dot, newer ones 5.
    pattern = r"(?:arxiv[.:]|abs/)(\d{4}\.\d{4,5})(v\d+)?"
    matches = re.findall(pattern, name, flags=re.IGNORECASE)

    # findall yields (id, version) tuples; version is "" when absent.
    results = [paper_id + version for paper_id, version in matches]

    if not results:
        return 0.0, None

    best_match = None
    highest_similarity = 0.0

    search_by_id = arxiv.Search(id_list=results)
    for paper in self.client.results(search_by_id):
        similarity = difflib.SequenceMatcher(
            None, name.lower(), paper.title.lower()
        ).ratio()

        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = paper

    # An id lookup is authoritative: if the API returned a paper, report
    # full confidence regardless of title similarity. If nothing came
    # back (e.g. a malformed id), report a miss instead of the original
    # misleading (1, None).
    if best_match is not None:
        return 1, best_match
    return 0.0, None

def find_paper_from_arxiv(self, name, max_results):
new_names = sorted(
[s.strip() for s in re.split(r"[.\\/]", name.strip()) if len(s) >= 20],
Expand Down Expand Up @@ -133,6 +161,7 @@ def find_paper_from_arxiv(self, name, max_results):
traceback.print_exc()

if highest_similarity > 0.9:
# print(f"found {query}")
break
# logger.warning(f"Not Found: {query}")

Expand All @@ -142,6 +171,8 @@ def find_paper_from_arxiv(self, name, max_results):
best_match = None
highest_similarity = 0.0

if not best_match:
return self.find_paper_with_arxiv_id(name)
return highest_similarity, best_match

def download_paper(self, name: str, max_results):
Expand Down

0 comments on commit 72fb94e

Please sign in to comment.