Skip to content

Commit

Permalink
Parse pdfs
Browse files Browse the repository at this point in the history
  • Loading branch information
teddygroves committed Dec 3, 2024
1 parent 7b7a98e commit b78ff63
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 21 deletions.
28 changes: 20 additions & 8 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import tempfile
from pathlib import Path

import pymupdf
import streamlit as st

from refcheck.doi import get_dois, doi_to_url
from refcheck.bibtex import fetch_bibtex
from refcheck.doi import get_dois, doi_to_url, fetch_doi_json

# Regular expression for spotting DOIs: "10", one character, a run of 4-9
# digits, a slash, then one or more of the permitted suffix characters.
# NOTE(review): the "." after "10" is unescaped, so it matches any character.
DOI_REGEX = r"10.\d{4,9}/[-._;()/:a-z0-9A-Z]+"

# Maps a document format name to the file extension used for it.
FORMAT_TO_EXT = dict(
    markdown="md",
    docx="docx",
    latex="tex",
    pdf="pdf",
)


Expand All @@ -28,11 +28,23 @@
# Store the uploaded bytes inside temp_dir, keeping the upload's file name.
path = Path(temp_dir) / uploaded_file.name
path.write_bytes(uploaded_file.getvalue())
results = get_dois(path, format)
if format == "pdf":
    # Extract the PDF's text with pymupdf and give get_dois a plain-text
    # copy instead of the PDF itself. The context manager closes the
    # document as soon as the text has been read; previously it was opened
    # and never closed.
    with pymupdf.open(path) as doc:
        text = "".join(page.get_text() for page in doc)
    # Flatten line breaks to spaces, then write the UTF-8 bytes next to the
    # upload with a ".txt" suffix.
    txt_path = path.with_suffix(".txt")
    txt_path.write_bytes(text.replace("\n", " ").encode("utf-8"))
    # NOTE(review): the text file is read with format "rtf", unchanged from
    # the original code -- confirm this is the intended reader.
    results = get_dois(txt_path, "rtf")
else:
    results = get_dois(path, format)
urls = [doi_to_url(doi) for doi in results]
bibtexes = [fetch_bibtex(url) for url in urls]

doi_jsons = [fetch_doi_json(url) for url in urls]
st.write("Here are the dois in your document. Please check if they are correct!")
for doi, url, bibtex in zip(results, urls, bibtexes):
st.write(f"[**{doi}**]({url}):", bibtex)
# Render one entry per DOI: a linked DOI with the title, then the authors.
for doi, url, doi_json in zip(results, urls, doi_jsons):
    # A lookup can come back empty or without some fields; fall back to
    # placeholders instead of raising KeyError halfway through rendering.
    title = f"*{doi_json.get('title', 'title unavailable')}*"
    names = []
    for a in doi_json.get("author", []):
        full_name = " ".join(
            part for part in (a.get("given"), a.get("family")) if part
        )
        # Presumably organisational authors carry "name" or "literal"
        # instead of given/family parts -- verify against the metadata source.
        names.append(
            full_name or a.get("name") or a.get("literal") or "unknown author"
        )
    authors = ", ".join(names)
    st.write(f"[**{doi}**]({url}): {title}", unsafe_allow_html=True)
    st.write(authors)
    st.write("")
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"pymupdf>=1.24.14",
"pypandoc-binary>=1.14",
"streamlit>=1.40.2",
]
Expand Down
7 changes: 0 additions & 7 deletions src/refcheck/bibtex.py

This file was deleted.

8 changes: 8 additions & 0 deletions src/refcheck/doi.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import re
import requests
from pathlib import Path

import pypandoc
Expand All @@ -14,3 +16,9 @@ def get_dois(path: Path, format) -> list[str]:

def doi_to_url(doi: str) -> str:
    """Return the dx.doi.org URL formed by appending *doi* to the resolver prefix."""
    resolver_prefix = "http://dx.doi.org/"
    return resolver_prefix + doi


def fetch_doi_json(doi_url: str, timeout: float = 10.0) -> dict:
    """Fetch the JSON metadata served for a DOI URL.

    Sends a GET request to *doi_url* with an ``accept: application/json``
    header and decodes the response body.

    Args:
        doi_url: URL to request, e.g. the output of ``doi_to_url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON object, or an empty dict when the request fails,
        the server answers with an error status, or the body is not a JSON
        object.
    """
    headers = {"accept": "application/json"}
    try:
        # Without a timeout a stalled server would block the caller forever.
        r = requests.get(doi_url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return {}
    # requests.get never returns None, so the original ``r is not None``
    # fallback could not trigger; check the HTTP status instead.
    if not r.ok:
        return {}
    try:
        data = json.loads(r.text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return {}
    return data if isinstance(data, dict) else {}
Binary file added tests/example.pdf
Binary file not shown.
6 changes: 0 additions & 6 deletions tests/example.tex

This file was deleted.

17 changes: 17 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b78ff63

Please sign in to comment.