Skip to content

Commit

Permalink
add ref handler that properly handles multiple references on one stat…
Browse files Browse the repository at this point in the history
…ement
  • Loading branch information
stuppie committed Sep 15, 2017
1 parent 59ca195 commit 2e07332
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 0 deletions.
1 change: 1 addition & 0 deletions wikidataintegrator/ref_handlers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .update_retrieved_if_new import update_retrieved_if_new
from .update_retrieved_if_new_multiple_refs import update_retrieved_if_new_multiple_refs
from .strict_overwrite import strict_overwrite
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
####
# custom ref handler
# If the newref is the same as an oldref except for the retrieved date:
#   - the new retrieved date is at least `days` newer: overwrite the oldref
#   - the new retrieved date is NOT at least `days` newer: keep the oldref
# If the newref differs from every oldref (in any other way), keep the oldrefs and append the newref
#
####
from datetime import datetime


def update_retrieved_if_new_multiple_refs(olditem, newitem, days=180):
    """
    Merge the references of ``newitem`` into ``olditem.references``, in place.

    Every new reference is compared with every existing reference, ignoring
    "retrieved" (P813) statements:

    * No existing reference matches: the new reference is appended.
    * A match exists and both references carry exactly one retrieved
      statement: the old reference is replaced only when the new retrieved
      date is at least ``days`` days newer; otherwise the old one is kept.
    * A match exists but the references do not both carry exactly one
      retrieved statement: the old reference is replaced.

    Existing references that match no new reference are left untouched.

    :param olditem: object whose ``references`` list is updated in place
    :param newitem: object whose ``references`` are the proposed references
    :param days: minimum number of days the new retrieved date must be ahead
        of the old one before a matching reference is replaced
    :return: None
    :raises ValueError: if a compared retrieved value does not match the
        ``+%Y-%m-%dT%H:%M:%SZ`` format
    """
    retrieved_prop = 'P813'
    datefmt = '+%Y-%m-%dT%H:%M:%SZ'

    def split_retrieved(ref):
        """Return ``(other_statements, retrieved_statements)`` of a reference."""
        others, retrieved = [], []
        for statement in ref:
            if statement.get_prop_nr() == retrieved_prop:
                retrieved.append(statement)
            else:
                others.append(statement)
        return others, retrieved

    def is_equal_not_retrieved(oldref, newref):
        """
        Return True if oldref == newref, NOT including "retrieved" statements.

        The comparison is order-insensitive but counts duplicates, so a
        reference is never treated as a match merely because every one of its
        statements occurs somewhere in the other reference.
        """
        if len(oldref) != len(newref):
            return False
        old_others, old_retrieved = split_retrieved(oldref)
        new_others, new_retrieved = split_retrieved(newref)
        if len(old_retrieved) != len(new_retrieved):
            return False
        # Statements may not be hashable, so pair them off with list.remove
        # (which relies on ==) instead of using a Counter/set.
        unmatched = list(old_others)
        for statement in new_others:
            try:
                unmatched.remove(statement)
            except ValueError:
                return False
        return not unmatched

    def ref_overwrite(oldref, newref, days):
        """
        Decide whether a matching oldref must be replaced by newref.

        Return False only when both references carry exactly one retrieved
        statement and the new date is less than ``days`` days newer.
        """
        old_retrieved = split_retrieved(oldref)[1]
        new_retrieved = split_retrieved(newref)[1]
        if not (len(old_retrieved) == len(new_retrieved) == 1):
            return True
        retold = datetime.strptime(old_retrieved[0].get_value()[0], datefmt)
        retnew = datetime.strptime(new_retrieved[0].get_value()[0], datefmt)
        return (retnew - retold).days >= days

    oldrefs = olditem.references

    # Unmatched new references are collected first and appended afterwards so
    # that they are never compared against the remaining new references.
    unmatched_newrefs = []
    for newref in newitem.references:
        found_mate = False
        for old_n, oldref in enumerate(oldrefs):
            if is_equal_not_retrieved(oldref, newref):
                found_mate = True
                if ref_overwrite(oldref, newref, days):
                    oldrefs[old_n] = newref
        if not found_mate:
            unmatched_newrefs.append(newref)
    oldrefs.extend(unmatched_newrefs)

0 comments on commit 2e07332

Please sign in to comment.