Skip to content

Commit

Permalink
Merge pull request #674 from GraphScope/test_arxiv
Browse files Browse the repository at this point in the history
Extract arxiv ids from references to get more cited papers
  • Loading branch information
Louyk14 authored Jan 14, 2025
2 parents 4e24433 + 840301d commit 72fb94e
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
26 changes: 26 additions & 0 deletions python/graphy/extractor/paper_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def __init__(
self.fake_extractor = False
self.linked_contents = set()
self.input_meta_data = {}
self.reference_page = -1

try:
if meta_path:
Expand Down Expand Up @@ -598,6 +599,29 @@ def _match_type(self, keywords: List[str], string: str, index: str):
return True
return False

def _extract_arxiv_link(self, page_num):
if self.reference_page > -1 and page_num < self.reference_page:
return

page_text = self.page_texts[page_num]
for block in page_text["blocks"]:
if block["type"] == 1:
continue
for line in block["lines"]:
succ, line_text = self._get_line_text(line)
if succ:
line_text = line_text.lower().strip()
if self.reference_page > -1:
pattern = r"(arXiv[.:]\d{4}\.\d{5}|abs/\d{4}\.\d{5})"
matches = re.findall(pattern, line_text)
for match in matches:
self.linked_contents.add(match)
else:
if line_text == "references" or line_text == "reference":
self.reference_page = page_num
else:
continue

def _extract_link(self, page, page_num):
"""Extract links from a given PDF page and retrieve the associated text."""
page_width = page.rect.width
Expand Down Expand Up @@ -883,6 +907,8 @@ def _extract_link(self, page, page_num):

logger.debug(f"Extracted {len(link_info)} links")

self._extract_arxiv_link(page_num)

return link_info

def _check_possible_section_title(
Expand Down
31 changes: 31 additions & 0 deletions python/graphy/utils/arxiv_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,34 @@ def __init__(

self.result_former = ResultFormer()

def find_paper_with_arxiv_id(self, name):
    """Look up a paper on arXiv by an id embedded in *name*.

    Extracts ids of the form ``arXiv:YYMM.NNNNN`` / ``abs/YYMM.NNNN``
    (case-insensitive, optional ``vN`` version suffix) and queries the
    arXiv API for them.

    Args:
        name: A reference string that may contain an arXiv identifier.

    Returns:
        A ``(similarity, paper)`` tuple. ``similarity`` is 1 when a paper
        was retrieved by id (an id lookup is authoritative), otherwise
        0.0; ``paper`` is the best-matching result or ``None``.
    """
    name = name.strip()
    # Ids harvested from PDF text may have been lowercased, so match
    # case-insensitively; pre-2015 arXiv ids have 4 digits after the
    # dot, newer ones 5.
    pattern = r"(?:arxiv[.:]|abs/)(\d{4}\.\d{4,5})(v\d+)?"
    matches = re.findall(pattern, name, flags=re.IGNORECASE)

    # findall yields (id, version) tuples; version is "" when absent.
    results = [paper_id + version for paper_id, version in matches]

    if not results:
        return 0.0, None

    best_match = None
    highest_similarity = 0.0

    search_by_id = arxiv.Search(id_list=results)
    for paper in self.client.results(search_by_id):
        similarity = difflib.SequenceMatcher(
            None, name.lower(), paper.title.lower()
        ).ratio()

        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = paper

    # An id lookup is authoritative: if the API returned a paper, report
    # full confidence regardless of title similarity. If nothing came
    # back (e.g. a malformed id), report a miss instead of the original
    # misleading (1, None).
    if best_match is not None:
        return 1, best_match
    return 0.0, None

def find_paper_from_arxiv(self, name, max_results):
new_names = sorted(
[s.strip() for s in re.split(r"[.\\/]", name.strip()) if len(s) >= 20],
Expand Down Expand Up @@ -133,6 +161,7 @@ def find_paper_from_arxiv(self, name, max_results):
traceback.print_exc()

if highest_similarity > 0.9:
# print(f"found {query}")
break
# logger.warning(f"Not Found: {query}")

Expand All @@ -142,6 +171,8 @@ def find_paper_from_arxiv(self, name, max_results):
best_match = None
highest_similarity = 0.0

if not best_match:
return self.find_paper_with_arxiv_id(name)
return highest_similarity, best_match

def download_paper(self, name: str, max_results):
Expand Down

0 comments on commit 72fb94e

Please sign in to comment.