Skip to content

Commit

Permalink
modify custom reference handler to allow passing a custom function to…
Browse files Browse the repository at this point in the history
… perform the ref handling
  • Loading branch information
stuppie committed Jul 12, 2017
1 parent 06672e2 commit 52f9f37
Show file tree
Hide file tree
Showing 10 changed files with 643 additions and 434 deletions.
2 changes: 2 additions & 0 deletions wikidataintegrator/ref_handlers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .update_retrieved_if_new import update_retrieved_if_new
from .strict_overwrite import strict_overwrite
10 changes: 10 additions & 0 deletions wikidataintegrator/ref_handlers/strict_overwrite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import datetime
import copy
####
# Example custom ref handler
# Always replaces all old refs with new refs
####

def strict_overwrite(olditem, newitem):
    """Replace all references on ``olditem`` with those of ``newitem``.

    Modifies ``olditem`` in place and returns ``None``.

    The references are deep-copied, so ``olditem`` never shares its reference
    list (or the objects inside it) with ``newitem``; mutating one item's
    references afterwards cannot leak into the other.

    :param olditem: object whose ``references`` attribute is overwritten
    :param newitem: object whose ``references`` attribute is the source
    """
    olditem.references = copy.deepcopy(newitem.references)
190 changes: 190 additions & 0 deletions wikidataintegrator/ref_handlers/test_update_retrieved_if_new.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@

#### same as before, but with one ref
import copy

from wikidataintegrator import wdi_fastrun, wdi_core
from wikidataintegrator.ref_handlers import update_retrieved_if_new as custom_ref_handler


class frc_fake_query_data_paper1(wdi_fastrun.FastRunContainer):
    """FastRunContainer pre-loaded with hard-coded data for item Q15397819.

    After the parent initializer runs, ``prop_data``, ``rev_lookup`` and
    ``prop_dt_map`` are filled with one P698 statement that carries a single
    reference (P248, P813 and P698 values).
    """

    def __init__(self, *args, **kwargs):
        super(frc_fake_query_data_paper1, self).__init__(*args, **kwargs)

        pmid = '99999999999'
        qid = 'Q15397819'

        # The one reference on the fake statement, as (property, value) pairs.
        reference_pairs = {
            ('P248', 'Q5412157'),  # stated in Europe PubMed Central
            ('P813', '+2017-01-01T00:00:00Z'),
            ('P698', pmid),
        }
        fake_statement = {
            'qual': set(),
            'ref': {'ref1': reference_pairs},
            'v': pmid,
        }

        self.prop_data[qid] = {'P698': {'fake statement id': fake_statement}}
        self.rev_lookup = {pmid: {qid}}
        self.prop_dt_map = {
            'P527': 'wikibase-item',
            'P248': 'wikibase-item',
            'P698': 'external-id',
            'P813': 'time',
        }


class fake_itemengine1(wdi_core.WDItemEngine):
    """Item engine whose ``get_wd_entity`` returns a hard-coded entity.

    The canned entity is Q15397819 with a single P698 statement carrying one
    reference made of P248, P698 and P813 snaks.
    """

    def get_wd_entity(self):
        """Build the canned entity JSON and return ``self.parse_wd_json`` of it."""

        def pmid_snak():
            # Fresh dict per call: the main snak and the reference snak have
            # the same content but are separate objects.
            return {'datatype': 'external-id',
                    'datavalue': {'type': 'string', 'value': '99999999999'},
                    'property': 'P698',
                    'snaktype': 'value'}

        stated_in_snak = {'datatype': 'wikibase-item',
                          'datavalue': {'type': 'wikibase-entityid',
                                        'value': {'entity-type': 'item',
                                                  'id': 'Q5412157',
                                                  'numeric-id': 5412157}},
                          'property': 'P248',
                          'snaktype': 'value'}

        retrieved_snak = {'datatype': 'time',
                          'datavalue': {'type': 'time',
                                        'value': {'after': 0,
                                                  'before': 0,
                                                  'calendarmodel': 'http://www.wikidata.org/entity/Q1985727',
                                                  'precision': 11,
                                                  'time': '+2017-01-01T00:00:00Z',
                                                  'timezone': 0}},
                          'property': 'P813',
                          'snaktype': 'value'}

        reference = {'hash': '9537cf2da990a2455ab924d027a0a1e5890bde8a',
                     'snaks': {'P248': [stated_in_snak],
                               'P698': [pmid_snak()],
                               'P813': [retrieved_snak]},
                     'snaks-order': ['P248', 'P813', 'P698']}

        # Claim shaped after:
        # https://www.wikidata.org/w/api.php?action=wbgetclaims&entity=Q15397819&property=P698&format=json
        claim = {'id': 'Q15397819$9460c2a2-4d42-adec-e841-9d5bbdc6695a',
                 'mainsnak': pmid_snak(),
                 'rank': 'normal',
                 'references': [reference],
                 'type': 'statement'}

        entity = {"aliases": {},
                  'descriptions': {'en': {'language': 'en', 'value': 'sdfs'}},
                  'id': 'Q15397819',
                  'labels': {'en': {'language': 'en',
                                    'value': 'drgdsgf'}},
                  'lastrevid': 478075481,
                  'modified': '2017-04-24T20:24:05Z',
                  'ns': 0,
                  'pageid': 31211964,
                  'sitelinks': {},
                  'title': 'Q15397819',
                  'type': 'item'
                  }
        # Added last so 'claims' ends up as the final key of the entity.
        entity['claims'] = {'P698': [claim]}
        return self.parse_wd_json(entity)


# Baseline input for the tests: one P698 statement with a single reference
# (P248, P698 and P813 snaks). Tests deep-copy this list before modifying it.
orig_statements1 = [
    wdi_core.WDExternalID(
        prop_nr="P698",
        value="99999999999",
        references=[[
            wdi_core.WDItemID(prop_nr="P248", value="Q5412157", is_reference=True),
            wdi_core.WDExternalID(prop_nr="P698", value="99999999999", is_reference=True),
            wdi_core.WDTime("+2017-01-01T00:00:00Z", prop_nr="P813", is_reference=True),
        ]],
    ),
]


def test_ref_custom():
    """Custom ref mode with an unchanged retrieved date: no write expected."""
    new_data = copy.deepcopy(orig_statements1)
    engine = fake_itemengine1(wd_item_id='Q20814663', global_ref_mode="CUSTOM", ref_handler=custom_ref_handler)

    # Snapshots of the P698 claims around the update; never asserted on.
    claims_before = engine.wd_json_representation['claims']['P698']
    engine.update(data=new_data)
    claims_after = engine.wd_json_representation['claims']['P698']

    # A write is needed as soon as one resulting statement has no equal
    # (references included) among the original statements.
    needs_write = False
    for stmt in engine.statements:
        if not any(stmt.equals(existing, include_ref=True) for existing in engine.original_statements):
            needs_write = True
            break
    assert not needs_write

    fastrun_kwargs = dict(base_data_type=wdi_core.WDBaseDataType, engine=wdi_core.WDItemEngine, use_refs=True,
                          ref_handler=custom_ref_handler)
    fastrun = frc_fake_query_data_paper1(**fastrun_kwargs)
    fastrun.debug = True
    assert not fastrun.write_required(data=new_data)

def test_ref_custom_append():
    """Custom ref mode with P698 as an append property.

    First scenario: the statement value differs, so a write is expected.
    Second scenario: nothing is new, so no write is expected.
    """

    def needs_write(engine):
        # True as soon as one resulting statement has no equal (references
        # included) among the engine's original statements.
        for stmt in engine.statements:
            if not any(stmt.equals(existing, include_ref=True) for existing in engine.original_statements):
                return True
        return False

    # --- different value, append prop ---
    new_data = copy.deepcopy(orig_statements1)
    new_data[0].set_value("new value")
    engine = fake_itemengine1(wd_item_id='Q20814663', global_ref_mode="CUSTOM", ref_handler=custom_ref_handler,
                              append_value=['P698'])
    # Snapshots of the P698 claims around the update; never asserted on.
    claims_before = engine.wd_json_representation['claims']['P698']
    engine.update(data=new_data)
    claims_after = engine.wd_json_representation['claims']['P698']
    assert needs_write(engine)

    fastrun = frc_fake_query_data_paper1(base_data_type=wdi_core.WDBaseDataType, engine=wdi_core.WDItemEngine,
                                         use_refs=True, ref_handler=custom_ref_handler)
    fastrun.debug = True
    assert fastrun.write_required(data=new_data, append_props=['P698'])

    # --- nothing new ---
    new_data = copy.deepcopy(orig_statements1)
    engine = fake_itemengine1(wd_item_id='Q20814663', global_ref_mode="CUSTOM", ref_handler=custom_ref_handler,
                              append_value=['P698'])
    claims_before = engine.wd_json_representation['claims']['P698']
    engine.update(data=new_data)
    claims_after = engine.wd_json_representation['claims']['P698']
    assert not needs_write(engine)

    fastrun = frc_fake_query_data_paper1(base_data_type=wdi_core.WDBaseDataType, engine=wdi_core.WDItemEngine,
                                         use_refs=True, ref_handler=custom_ref_handler)
    fastrun.debug = True
    assert not fastrun.write_required(data=new_data, append_props=['P698'])


def test_ref_custom_diff_date_year():
    """Retrieved date replaced by one more than a year later: write expected."""
    new_data = copy.deepcopy(orig_statements1)
    # Index 2 of the only reference is the P813 (retrieved) snak.
    newer_retrieved = wdi_core.WDTime("+2018-04-24T00:00:00Z", prop_nr="P813", is_reference=True)
    new_data[0].references[0][2] = newer_retrieved

    engine = fake_itemengine1(wd_item_id='Q20814663', global_ref_mode="CUSTOM", ref_handler=custom_ref_handler)
    # Snapshots of the P698 claims around the update; never asserted on.
    claims_before = engine.wd_json_representation['claims']['P698']
    engine.update(data=new_data)
    claims_after = engine.wd_json_representation['claims']['P698']

    # A write is needed as soon as one resulting statement has no equal
    # (references included) among the original statements.
    needs_write = False
    for stmt in engine.statements:
        if not any(stmt.equals(existing, include_ref=True) for existing in engine.original_statements):
            needs_write = True
            break
    assert needs_write

    fastrun_kwargs = dict(base_data_type=wdi_core.WDBaseDataType, engine=wdi_core.WDItemEngine, use_refs=True,
                          ref_handler=custom_ref_handler)
    fastrun = frc_fake_query_data_paper1(**fastrun_kwargs)
    fastrun.debug = True
    assert fastrun.write_required(data=new_data)


def test_ref_custom_diff_date_month():
    """Retrieved date replaced by one a month later: no write expected."""
    new_data = copy.deepcopy(orig_statements1)
    # Index 2 of the only reference is the P813 (retrieved) snak.
    slightly_newer_retrieved = wdi_core.WDTime("+2017-02-01T00:00:00Z", prop_nr="P813", is_reference=True)
    new_data[0].references[0][2] = slightly_newer_retrieved

    engine = fake_itemengine1(wd_item_id='Q20814663', global_ref_mode="CUSTOM", ref_handler=custom_ref_handler)
    # Snapshots of the P698 claims around the update; never asserted on.
    claims_before = engine.wd_json_representation['claims']['P698']
    engine.update(data=new_data)
    claims_after = engine.wd_json_representation['claims']['P698']

    # A write is needed as soon as one resulting statement has no equal
    # (references included) among the original statements.
    needs_write = False
    for stmt in engine.statements:
        if not any(stmt.equals(existing, include_ref=True) for existing in engine.original_statements):
            needs_write = True
            break
    assert not needs_write

    fastrun_kwargs = dict(base_data_type=wdi_core.WDBaseDataType, engine=wdi_core.WDItemEngine, use_refs=True,
                          ref_handler=custom_ref_handler)
    fastrun = frc_fake_query_data_paper1(**fastrun_kwargs)
    fastrun.debug = True
    assert not fastrun.write_required(data=new_data)


def test_ref_custom_diff_stated_in():
    """Reference with a different "stated in" item: write expected.

    The P248 snak of the only reference (index 0) is replaced by one pointing
    at another item, so the new reference differs from the existing one in
    something other than its retrieved date.
    """
    statements = copy.deepcopy(orig_statements1)
    # The replacement must use P248, the property of the snak it replaces.
    # Building it with P813 (the retrieved-date property) would test a
    # malformed reference instead of a different "stated in" value.
    statements[0].references[0][0] = wdi_core.WDItemID("Q123", prop_nr="P248", is_reference=True)
    item = fake_itemengine1(wd_item_id='Q20814663', global_ref_mode="CUSTOM", ref_handler=custom_ref_handler)
    # Snapshots of the P698 claims around the update; never asserted on.
    orig = item.wd_json_representation['claims']['P698']
    item.update(data=statements)
    new = item.wd_json_representation['claims']['P698']
    # A write is needed when some resulting statement has no equal
    # (references included) among the original statements.
    require_write = not all(
        any(x.equals(y, include_ref=True) for y in item.original_statements) for x in item.statements)
    assert require_write

    frc = frc_fake_query_data_paper1(base_data_type=wdi_core.WDBaseDataType, engine=wdi_core.WDItemEngine, use_refs=True,
                                     ref_handler=custom_ref_handler)
    frc.debug = True
    assert frc.write_required(data=statements)
50 changes: 50 additions & 0 deletions wikidataintegrator/ref_handlers/update_retrieved_if_new.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
####
# custom ref handler
# If the new ref is identical to the old ref except for its retrieved date (P813):
#   - overwrite when the new retrieved date is at least `days` days newer
#   - keep the old ref when the new retrieved date is NOT at least `days` days newer
# If the refs differ in any other way, overwrite with the new ref
#
# Only handles cases where olditem and newitem each have one reference. Otherwise, defaults to overwrite
####
from datetime import datetime
import copy

def update_retrieved_if_new(olditem, newitem, days=180):
    """Refresh the references of ``olditem`` from ``newitem`` when warranted.

    Modifies ``olditem`` in place and always returns ``None``.

    Rules:

    * If either item does not have exactly one reference, ``olditem`` gets
      the references of ``newitem``.
    * If both have one reference and they differ in anything other than the
      retrieved date (P813), ``olditem`` gets the references of ``newitem``.
    * If they differ at most in the retrieved date, ``olditem`` is only
      updated when the new date is at least ``days`` days after the old one.

    Whenever references are taken over they are deep-copied, so ``olditem``
    never shares reference objects with ``newitem``.

    :param olditem: object with a ``references`` list; updated in place
    :param newitem: object with a ``references`` list; never modified
    :param days: minimum age difference, in days, that triggers an update of
        an otherwise identical reference
    :raises ValueError: if a compared retrieved date does not match the
        ``+YYYY-MM-DDTHH:MM:SSZ`` format
    """
    retrieved_prop = 'P813'
    datefmt = '+%Y-%m-%dT%H:%M:%SZ'

    def ref_overwrite(oldref, newref, days):
        """Return True when ``newref`` should replace ``oldref``.

        True if the refs differ in anything but the retrieved date, or if the
        new retrieved date is at least ``days`` days newer; False otherwise.
        """
        if len(oldref) != len(newref):
            return True

        old_other = [x for x in oldref if x.get_prop_nr() != retrieved_prop]
        new_other = [x for x in newref if x.get_prop_nr() != retrieved_prop]
        # Compare in both directions: a one-way check lets a new ref with a
        # duplicated snak hide a snak that only exists in the old ref.
        if len(old_other) != len(new_other):
            return True
        if not all(x in old_other for x in new_other):
            return True
        if not all(x in new_other for x in old_other):
            return True

        old_retrieved = [x for x in oldref if x.get_prop_nr() == retrieved_prop]
        new_retrieved = [x for x in newref if x.get_prop_nr() == retrieved_prop]
        # Dates can only be compared when each ref has exactly one of them.
        if not (len(new_retrieved) == len(old_retrieved) == 1):
            return True

        # get_value()[0] is the time string of the retrieved snak.
        retold = datetime.strptime(old_retrieved[0].get_value()[0], datefmt)
        retnew = datetime.strptime(new_retrieved[0].get_value()[0], datefmt)
        return (retnew - retold).days >= days

    newrefs = newitem.references
    oldrefs = olditem.references
    if not (len(newrefs) == len(oldrefs) == 1) or ref_overwrite(oldrefs[0], newrefs[0], days):
        # Deep copy in every overwrite case so the two items stay independent.
        olditem.references = copy.deepcopy(newrefs)
    return None
Empty file.
Loading

0 comments on commit 52f9f37

Please sign in to comment.