You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
{{ message }}
This repository has been archived by the owner on Apr 15, 2024. It is now read-only.
I wrote code that converts PDFs to text, and the error below keeps occurring on one PDF (attached) that is very similar to the others, which the package parses successfully. The error occurs in the following function I wrote:
def pdf_to_text(input_file, output):
    """Extract the text of a PDF and write it to ``<cwd>/PDFs/<output>``.

    Every page of ``input_file`` is run through a ``PDFPageInterpreter``
    that feeds a ``TextConverter``; the accumulated text is then written
    as UTF-8 with a BOM (``utf-8-sig``).

    Args:
        input_file: Path of the PDF file to read.
        output: Name of the text file to create inside the ``PDFs``
            directory under the current working directory. The directory
            is not created here, so it must already exist.
    """
    rsrc_mgr = PDFResourceManager()
    text_buffer = io.StringIO()
    converter = TextConverter(rsrc_mgr, text_buffer, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc_mgr, converter)

    # Context manager guarantees the PDF handle is released even when
    # process_page() raises on a malformed document.
    with open(input_file, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)

    txt = text_buffer.getvalue()

    # os.path.join picks the platform's separator instead of hard-coding
    # Windows backslashes into the path.
    out_path = os.path.join(os.getcwd(), 'PDFs', output)
    with open(out_path, 'w', encoding='utf-8-sig') as out_file:
        out_file.write(txt)
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
850
851 def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
--> 852 """Move text position and set leading"""
853 tx = cast(float, tx)
854 ty = cast(float, ty)
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in init_resources(self, resources)
354
355 def init_resources(self, resources: Dict[object, object]) -> None:
--> 356 """Prepare the fonts and XObjects listed in the Resource attribute."""
357 self.resources = resources
358 self.fontmap: Dict[object, PDFFont] = {}
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in get_font(self, objid, spec)
202 else:
203 log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
--> 204 if settings.STRICT:
205 if spec["Type"] is not LITERAL_FONT:
206 raise PDFFontError("Type is not /Font")
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in get_font(self, objid, spec)
193 return CMapDB.get_cmap(cmapname)
194 except CMapDB.CMapNotFound:
--> 195 if strict:
196 raise
197 return CMap()
~\anaconda3\lib\site-packages\pdfminer\pdffont.py in init(self, rsrcmgr, spec)
665 self.gid2code[gid] = code
666 elif format == b"\x01":
--> 667 # Format 1
668 (n,) = struct.unpack("B", self.fp.read(1))
669 code = 0
~\anaconda3\lib\site-packages\pdfminer\pdftypes.py in get_data(self)
290 if name in self.attrs:
291 return self.attrs[name]
--> 292 return default
293
294 def get_filters(self) -> List[Tuple[Any, Any]]:
~\anaconda3\lib\site-packages\pdfminer\pdftypes.py in decode(self)
271 else:
272 assert self.data is not None
--> 273 return "<PDFStream(%r): len=%d, %r>" % (
274 self.objid,
275 len(self.data),
TypeError: argument of type 'NoneType' is not iterable
The text was updated successfully, but these errors were encountered:
I wrote code that converts PDFs to text, and the error below keeps occurring on one PDF (attached) that is very similar to the others, which the package parses successfully. The error occurs in the following function I wrote:
0040a8d544f3eb073be9de24a4eee14e.pdf
~\AppData\Local\Temp/ipykernel_11728/660795657.py in pdf_to_text(input_file, output)
6 interpreter = PDFPageInterpreter(resMgr,TxtConverter)
7 for page in PDFPage.get_pages(i_f):
----> 8 interpreter.process_page(page)
9
10 txt = retData.getvalue()
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
839 self.textstate.rise = cast(float, rise)
840 return
--> 841
842 def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
843 """Move text position"""
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
850
851 def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
--> 852 """Move text position and set leading"""
853 tx = cast(float, tx)
854 ty = cast(float, ty)
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in init_resources(self, resources)
354
355 def init_resources(self, resources: Dict[object, object]) -> None:
--> 356 """Prepare the fonts and XObjects listed in the Resource attribute."""
357 self.resources = resources
358 self.fontmap: Dict[object, PDFFont] = {}
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in get_font(self, objid, spec)
202 else:
203 log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
--> 204 if settings.STRICT:
205 if spec["Type"] is not LITERAL_FONT:
206 raise PDFFontError("Type is not /Font")
~\anaconda3\lib\site-packages\pdfminer\pdfinterp.py in get_font(self, objid, spec)
193 return CMapDB.get_cmap(cmapname)
194 except CMapDB.CMapNotFound:
--> 195 if strict:
196 raise
197 return CMap()
~\anaconda3\lib\site-packages\pdfminer\pdffont.py in init(self, rsrcmgr, spec)
665 self.gid2code[gid] = code
666 elif format == b"\x01":
--> 667 # Format 1
668 (n,) = struct.unpack("B", self.fp.read(1))
669 code = 0
~\anaconda3\lib\site-packages\pdfminer\pdftypes.py in get_data(self)
290 if name in self.attrs:
291 return self.attrs[name]
--> 292 return default
293
294 def get_filters(self) -> List[Tuple[Any, Any]]:
~\anaconda3\lib\site-packages\pdfminer\pdftypes.py in decode(self)
271 else:
272 assert self.data is not None
--> 273 return "<PDFStream(%r): len=%d, %r>" % (
274 self.objid,
275 len(self.data),
TypeError: argument of type 'NoneType' is not iterable
The text was updated successfully, but these errors were encountered: