Skip to content

Commit

Permalink
Merge branch 'main' into bug/17352_gracefully_handle_malformed_responses
Browse files: browse the repository at this point in the history
  • Loading branch information
okirmis authored Dec 23, 2024
2 parents e69030e + 7b107a7 commit a6e1377
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,27 @@ def load_data(self, urls: List[str]) -> List[Document]:
for url in urls:
parsed = feedparser.parse(url)
for entry in parsed.entries:
doc_id = entry.id or entry.link
if "content" in entry:
data = entry.content[0].value
else:
data = entry.description or entry.summary
doc_id = getattr(entry, "id", None) or getattr(entry, "link", None)
data = entry.get("content", [{}])[0].get(
"value", entry.get("description", entry.get("summary", ""))
)

if self.html_to_text:
import html2text

data = html2text.html2text(data)

extra_info = {"title": entry.title, "link": entry.link}
documents.append(Document(text=data, id_=doc_id, extra_info=extra_info))
extra_info = {
"title": getattr(entry, "title", None),
"link": getattr(entry, "link", None),
"date": getattr(entry, "published", None),
}

if doc_id:
documents.append(
Document(text=data, id_=doc_id, extra_info=extra_info)
)
else:
documents.append(Document(text=data, extra_info=extra_info))

return documents
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.3.1"
version = "0.3.2"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
Expand Down

0 comments on commit a6e1377

Please sign in to comment.