Skip to content

Commit

Permalink
better way to catch utf8 problems
Browse files (browse the repository at this point in the history)
  • Loading branch information
wo committed Sep 17, 2016
1 parent 6815362 commit 8e7f80c
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
5 changes: 1 addition & 4 deletions opp/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,7 @@ def update_db(self, **kwargs):
kwargs['last_checked'] = time.strftime('%Y-%m-%d %H:%M:%S')
query = "UPDATE sources SET {},urlhash=MD5(url) WHERE source_id = %s".format(
",".join(k+"=%s" for k in kwargs.keys()))
try:
cur.execute(query, tuple(kwargs.values()) + (self.source_id,))
except Exception as e: # hack to catch 4-byte unicode strings not supported by mysql
debug(1, "COULD NOT WRITE DOC TO DATABASE: %s", e)
cur.execute(query, tuple(kwargs.values()) + (self.source_id,))
debug(3, cur._last_executed)
db.commit()

Expand Down
10 changes: 7 additions & 3 deletions opp/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,13 @@ def process_link(li, force_reprocess=False, redir_url=None, keep_tempfiles=False
debug(2, "new source page: setting found_date to 1970")
doc.found_date = datetime(1970, 1, 1)

doc.update_db()
li.update_db(status=1, doc_id=doc.doc_id)

try:
doc.update_db()
li.update_db(status=1, doc_id=doc.doc_id)
except Exception as e: # hack to catch 4-byte unicode strings not supported by mysql
debug(1, "COULD NOT WRITE DOC TO DATABASE: %s", e)
return 0

# categorize, but only if doc has more than 700 words --
# otherwise categorization is pretty random:
if doc.numwords > 700:
Expand Down

0 comments on commit 8e7f80c

Please sign in to comment.