Refactor and improve kb_tests.py #270

Open · wants to merge 3 commits into base: master
requirements.txt: 3 changes (2 additions & 1 deletion)
@@ -6,4 +6,5 @@ pandasql
 pysolr
 base36
 vfb_connect
-better_profanity
+better_profanity
+tqdm
src/uk/ac/ebi/vfb/neo4j/neo2neo/kb_tests.py: 243 changes (151 additions & 92 deletions)
@@ -2,16 +2,10 @@
 import sys
 import warnings
 import json
+from tqdm import tqdm

-nc = neo4j_connect(sys.argv[1], sys.argv[2], sys.argv[3])

-silent_mode = False
-# prevents sys.exit(1) on failure, just silently logs the result and exits
-if len(sys.argv) > 4 and sys.argv[4] == 'silent_fail':
-    silent_mode = True


-def query(query):
+def query(nc, query):
     q = nc.commit_list([query])
     if not q:
         return False
@@ -22,7 +16,7 @@ def query(query):
     return dc


-def query_ind_count(query):
+def query_ind_count(nc, query):
     q = nc.commit_list([query])
     if not q:
         return False
@@ -36,92 +30,157 @@ def query_ind_count(query):
     return dc[0]['ind_count']


-def compare(dataset: str, description: str, query1: str, query2: str, log: dict, verbose=False):
-    r1 = query(query1)[0]
-    r2 = query(query2)[0]
+def compare(nc, description: str, query1: str, query2: str, verbose=False):
+    r1 = query(nc, query1)[0]
+    r2 = query(nc, query2)[0]
     if r1['ind_count'] == r2['ind_count']:
         if verbose:
             print(query2)
             print("Testing assertion:" + description)
             print("Result: True")
-        return True
+        return None
     else:
         print("Testing assertion:" + description)
         print(query2)
         print("Result: inds_in_datset: %d ; Compliant with pattern: %d" % (r1['ind_count'], r2['ind_count']))
-        log[description + '. Failing Inds'] = list(set(r1['ind_list']) - set(r2['ind_list']))
-        return False


-datasets = nc.commit_list(
-    ["MATCH (ds:DataSet) RETURN ds.short_form"])  # removed "WHERE ds.schema = 'image'" as not in kb2
-dc = results_2_dict_list(datasets)

-return_state = True

-for d in dc:
-    log = {}
-    ds = d['ds.short_form']
-    dataset_status = True
-    print("\n")
-    print("Testing: " + ds)
-    final_clauses = " WHERE ds.short_form = '%s' RETURN COUNT (DISTINCT i) as ind_count" \
-                    ", COLLECT(i.short_form) as ind_list" % ds
-    base_query = "MATCH (ds:DataSet)<-[:has_source]-(i:Individual)"
-    new_base_query = "MATCH (ds:DataSet)<-[:Annotation { short_form: 'source'}]-(i:Individual)"
-    if query_ind_count(base_query + final_clauses) == 0:
-        if query_ind_count(new_base_query + final_clauses):
-            base_query = new_base_query
-            print("Using new schema for tests.")
+        failing_individuals = list(set(r1['ind_list']) - set(r2['ind_list']))
+        return {
+            'description': description,
+            'failed_individuals': failing_individuals,
+            'total_indv_count': r1['ind_count'],
+            'compliant_indv_count': r2['ind_count'],
+            'failed_indv_count': len(failing_individuals),
+            'query': query2
+        }


+def print_ratio_summary(desc, value, total):
+    print(desc + ":" + "".ljust(22-len(desc)) + str(value) + "/" + str(total) + " \t(" + str(int(100 * value / total)) + "%)")


+def dump_report_to_file(test_rpt):
+    with open("kb_test.report", 'w') as report:
+        report.write(json.dumps(test_rpt, indent=2))


+def test_kb(server, user, password, ds_short_forms, silent_mode):
+    nc = neo4j_connect(server, user, password)

+    datasets = nc.commit_list(
+        [get_ds_query(ds_short_forms)])  # removed "WHERE ds.schema = 'image'" as not in kb2
+        # ["MATCH (ds:DataSet) RETURN ds.short_form LIMIT 8"])
+    dc = results_2_dict_list(datasets)
+    test_report = dict()
+    test_report["failed_datasets"] = list()
+    test_report["empty_datasets"] = list()
+    test_report["successful_datasets"] = list()
+    test_count = 0
+    failed_test_count = 0

+    test_progress = tqdm(dc, desc='Test Progress', total=len(dc), bar_format='{l_bar}{bar:20}| {n_fmt}/{total_fmt}')
+    for d in test_progress:
+        ds = d['ds.short_form']
+        final_clauses = " WHERE ds.short_form = '%s' RETURN COUNT (DISTINCT i) as ind_count" \
+                        ", COLLECT(i.short_form) as ind_list" % ds
+        base_query = "MATCH (ds:DataSet)<-[:has_source]-(i:Individual)"
+        new_base_query = "MATCH (ds:DataSet)<-[:Annotation { short_form: 'source'}]-(i:Individual)"
+        if query_ind_count(nc, base_query + final_clauses) == 0:
+            if query_ind_count(nc, new_base_query + final_clauses):
+                base_query = new_base_query
+                print("Using new schema for tests.")
+            else:
+                test_report["empty_datasets"].append(ds)
+                continue
+        query1 = base_query + final_clauses
+        extended_base_query = base_query + "<-[:depicts]-(j:Individual)"

+        tests = list()
+        tests.append({'query': extended_base_query + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching channel individuals.",
+                      'name': 'matching_channel_test'})

+        tests.append({'query': extended_base_query + "-[in_register_with]->(k:Individual)" + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching registered channel individuals.",
+                      'name': 'registered_channel_test'})

+        tests.append({'query': extended_base_query + "-[:is_specified_output_of]->(:Class)" + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching channel individuals with imaging method.",
+                      'name': 'matching_channel_with_imaging_method_test'})

+        tests.append({'query': extended_base_query + "-[:INSTANCEOF]->(c:Class { label: 'channel'})" + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching channel, typed individuals",
+                      'name': 'matching_channel_typed_individuals_test'})

+        tests.append({'query': base_query + "-[:INSTANCEOF]->(c:Class)" + final_clauses,
+                      'description': "All anatomical individuals in dataset are typed.",
+                      'name': 'typed_datasets_test'})

+        failed_tests = dict()
+        for test in tests:
+            result = compare(nc, description=test['description'], query1=query1, query2=test['query'])
+            test_count += 1
+            if result:
+                failed_tests[test['name']] = result
+                failed_test_count += 1

+        if failed_tests:
+            test_report["failed_datasets"].append({
+                'dataset': ds,
+                'failed_tests': failed_tests
+            })
         else:
-            print("This dataset has no content")
-            continue
-    query1 = base_query + final_clauses
-    extended_base_query = base_query + "<-[:depicts]-(j:Individual)"
-    query2 = extended_base_query + final_clauses
-    query3 = extended_base_query + "-[in_register_with]->(k:Individual)" + final_clauses
-    query4 = extended_base_query + "-[:is_specified_output_of]->(:Class)" + final_clauses
-    query5 = extended_base_query + "-[:INSTANCEOF]->(c:Class { label: 'channel'})" + final_clauses
-    query6 = base_query + "-[:INSTANCEOF]->(c:Class)" + final_clauses

-    test_stats = []

-    test_stats.append(compare(dataset=ds,
-                              description="All anatomical individuals in dataset have matching channel individuals.",
-                              query1=query1,
-                              query2=query2,
-                              log=log))
-    test_stats.append(
-        compare(description="All anatomical individuals in dataset have matching registered channel individuals.",
-                dataset=ds,
-                query1=query1,
-                query2=query3,
-                log=log))
-    test_stats.append(compare(
-        description="All anatomical individuals in dataset have matching channel individuals with imaging method",
-        dataset=ds,
-        query1=query1,
-        query2=query4,
-        log=log))
-    test_stats.append(
-        compare(description="All anatomical individuals in dataset have matching channel, typed individuals",
-                dataset=ds,
-                query1=query1,
-                query2=query5,
-                log=log))
-    test_stats.append(compare(description="All anatomical individuals in dataset are typed",
-                              dataset=ds,
-                              query1=query1,
-                              query2=query6,
-                              log=log))
-    if False in test_stats:
-        return_state = False
-        with open(ds + ".report", 'w') as report:
-            report.write(json.dumps(log))
+            test_report["successful_datasets"].append(ds)

test_report["summary"] = {
'dataset_count': len(dc),
'failed_dataset_count': len(test_report["failed_datasets"]),
'empty_dataset_count': len(test_report["empty_datasets"]),
'successful_datasets_count': len(test_report["successful_datasets"]),
'total_tests_run': test_count,
'failed_tests_count': failed_test_count
}
dump_report_to_console(test_report)
dump_report_to_file(test_report)
if (test_report["failed_datasets"] or test_report["empty_datasets"]) and not silent_mode:
sys.exit(1)


def get_ds_query(ds_short_forms):
if ds_short_forms:
q = "MATCH(ds: DataSet) WHERE ANY(sf IN ds.short_form WHERE sf IN {}) RETURN ds.short_form"\
.format(str(ds_short_forms))
else:
print("Passes!")

if not return_state and not silent_mode:
sys.exit(1)
q = "MATCH (ds:DataSet) RETURN ds.short_form"
return q


+def dump_report_to_console(test_rpt):
+    if "summary" not in test_rpt:
+        print("\nNo tests run!")
+        return
+    print("")
+    summary = test_rpt["summary"]

+    print_ratio_summary("Failed Tests", summary["failed_tests_count"], summary["total_tests_run"])
+    print_ratio_summary("Failed Datasets", summary["failed_dataset_count"], summary["dataset_count"])
+    print_ratio_summary("Empty Datasets", summary["empty_dataset_count"], summary["dataset_count"])
+    print_ratio_summary("Successful Datasets", summary["successful_datasets_count"], summary["dataset_count"])

+    print("")
+    failed_datasets = test_rpt["failed_datasets"]
+    if failed_datasets:
+        print("=== Failed Datasets:\n")
+        for failed_dataset in failed_datasets:
+            print("Dataset: " + failed_dataset["dataset"] + "\n")
+            ds_failed_tests = failed_dataset["failed_tests"]
+            for test_name in ds_failed_tests:
+                failed_test = ds_failed_tests[test_name]
+                print("\tTesting assertion: " + failed_test["description"])
+                print("\t{} of total {} individuals failed. First 5 failing individuals are: {}"
+                      .format(str(failed_test["failed_indv_count"]),
+                              str(failed_test["total_indv_count"]),
+                              str(failed_test["failed_individuals"][:5])))
+                print("\t{}\n".format(failed_test["query"]))

+    if test_rpt["empty_datasets"]:
+        print("=== Empty Datasets:\n")
+        for empty_dataset in test_rpt["empty_datasets"]:
+            print(" - " + empty_dataset)

+    if not failed_datasets and not test_rpt["empty_datasets"]:
+        print("All dataset tests are successful.")

# KB <-> prod check numbers
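
A minimal sketch of how the kb_test.report file written by dump_report_to_file could be read back. The key names come from test_kb and compare above; the file path and the printed wording are illustrative only.

import json

# Load the JSON report that dump_report_to_file writes to the working directory.
with open("kb_test.report") as report_file:
    report = json.load(report_file)

summary = report["summary"]
print("{} of {} tests failed".format(summary["failed_tests_count"], summary["total_tests_run"]))

for failed in report["failed_datasets"]:
    # Each entry pairs a dataset short form with the per-test failure dicts
    # returned by compare(): description, failed_individuals, counts and query.
    print(failed["dataset"], sorted(failed["failed_tests"].keys()))
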
src/uk/ac/ebi/vfb/neo4j/neo2neo/kb_tests_runner.py: 24 changes (24 additions & 0 deletions)
@@ -0,0 +1,24 @@
+import argparse
+from uk.ac.ebi.vfb.neo4j.neo2neo.kb_tests import test_kb


+def main():
+    parser = argparse.ArgumentParser(prog="kb_test", description='VFB KB consistency tester cli interface.')
+    parser.add_argument('-k', '--kb', action='store', required=True, help='KB server URI')
+    parser.add_argument('-u', '--user', action='store', required=True, help='KB server user')
+    parser.add_argument('-p', '--password', action='store', required=True, help='KB server password')
+    parser.add_argument('-s', '--silent', action='store_true', help='Activates silent mode that prevents abnormal exit.')
+    parser.add_argument('-d', '--dataset', action='append', nargs='*', help='Short form of the dataset to test.')

+    args = parser.parse_args()

+    datasets = list()
+    if 'dataset' in args and args.dataset:
+        # handle both '-d x -d y' and '-d x y'
+        datasets = [item for sublist in args.dataset for item in sublist]

+    test_kb(args.kb, args.user, args.password, datasets, args.silent)


+if __name__ == "__main__":
+    main()
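
A usage sketch for the new runner: it is meant to be invoked as a module, e.g. python -m uk.ac.ebi.vfb.neo4j.neo2neo.kb_tests_runner -k <server> -u <user> -p <password>, optionally with --silent and one or more -d dataset short forms. The equivalent direct call below uses placeholder connection details and a made-up dataset short form, not values from this PR.

from uk.ac.ebi.vfb.neo4j.neo2neo.kb_tests import test_kb

# Placeholder server, credentials and dataset short form; silent_mode=True
# mirrors --silent and suppresses the sys.exit(1) on failure.
test_kb("http://localhost:7474", "neo4j", "<password>",
        ds_short_forms=["example_dataset"], silent_mode=True)
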