-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add ref handler that properly handles multiple references on one statement
- Loading branch information
Showing
2 changed files
with
71 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from .update_retrieved_if_new import update_retrieved_if_new | ||
from .update_retrieved_if_new_multiple_refs import update_retrieved_if_new_multiple_refs | ||
from .strict_overwrite import strict_overwrite |
70 changes: 70 additions & 0 deletions
70
wikidataintegrator/ref_handlers/update_retrieved_if_new_multiple_refs.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#### | ||
# custom ref handler | ||
# If the newref is the same as the oldref except for the retrieved date: | ||
# overwrite if the new retrieved date is at least `days` newer, otherwise keep the old ref | ||
# If the refs are different (in any other way), overwrite with new ref | ||
# | ||
#### | ||
from datetime import datetime | ||
|
||
|
||
def update_retrieved_if_new_multiple_refs(olditem, newitem, days=180):
    """
    Merge the references of `newitem` into `olditem`, modifying `olditem.references` in place.

    Every new reference is compared against every old reference, ignoring the values of the
    "retrieved" (P813) statements:

    - a matching old ref whose retrieved date is at least `days` older is replaced by the new ref
    - a matching old ref whose retrieved date is NOT `days` older is kept
    - a matching old ref is also replaced when the refs do not carry exactly one retrieved
      statement each (there is no single date to compare)
    - a new ref without any matching old ref is appended

    Any old ref that does not match a new proposed reference is kept untouched.

    :param olditem: object whose `references` list is updated in place; each reference is a list
        of statements exposing `get_prop_nr()` and `get_value()`
    :param newitem: object providing the proposed `references`
    :param days: minimum difference, in days, between retrieved dates for a ref to be refreshed
    :return: None
    """
    retrieved_prop = 'P813'
    datefmt = '+%Y-%m-%dT%H:%M:%SZ'

    def split_retrieved(ref):
        """Return (non-retrieved statements, retrieved statements) of a reference."""
        others, retrieved = [], []
        for stmt in ref:
            if stmt.get_prop_nr() == retrieved_prop:
                retrieved.append(stmt)
            else:
                others.append(stmt)
        return others, retrieved

    def same_statements(old_stmts, new_stmts):
        """
        Return True if both lists hold the same statements, in any order.

        Compared as multisets: every new statement consumes one equal old statement, so a
        duplicated statement on one side cannot be satisfied by a single one on the other side.
        Only `==` is required of the statements (they need not be hashable).
        """
        remaining = list(old_stmts)
        for stmt in new_stmts:
            try:
                remaining.remove(stmt)
            except ValueError:
                return False
        return not remaining

    def is_equal_not_retrieved(oldref, newref):
        """
        Return True if oldref == newref, NOT including the values of any "retrieved" statements.

        The number of retrieved statements still has to agree on both sides.
        """
        if len(oldref) != len(newref):
            return False
        old_others, old_retrieved = split_retrieved(oldref)
        new_others, new_retrieved = split_retrieved(newref)
        if len(old_retrieved) != len(new_retrieved):
            return False
        return same_statements(old_others, new_others)

    def retrieved_needs_refresh(oldref, newref):
        """
        Decide whether a matching old ref must be replaced by the new one.

        Return True if the new retrieved date is at least `days` newer, or if the refs do not
        each hold exactly one retrieved statement; return False otherwise.
        """
        old_retrieved = split_retrieved(oldref)[1]
        new_retrieved = split_retrieved(newref)[1]
        if not (len(old_retrieved) == len(new_retrieved) == 1):
            return True
        # get_value()[0] is parsed as a timestamp such as '+2017-01-31T00:00:00Z'
        retold = datetime.strptime(old_retrieved[0].get_value()[0], datefmt)
        retnew = datetime.strptime(new_retrieved[0].get_value()[0], datefmt)
        return (retnew - retold).days >= days

    newrefs = newitem.references
    oldrefs = olditem.references

    # New refs without a mate are collected first and appended only after all comparisons,
    # so they are never matched against each other.
    unmatched = []
    for newref in newrefs:
        found_mate = False
        for old_n, oldref in enumerate(oldrefs):
            if is_equal_not_retrieved(oldref, newref):
                found_mate = True
                if retrieved_needs_refresh(oldref, newref):
                    oldrefs[old_n] = newref
        if not found_mate:
            unmatched.append(newref)
    oldrefs.extend(unmatched)