Refactor and improve kb_tests.py #270

Open · wants to merge 3 commits into base: master
requirements.txt: 3 changes (2 additions & 1 deletion)
@@ -6,4 +6,5 @@ pandasql
 pysolr
 base36
 vfb_connect
-better_profanity
+better_profanity
+tqdm
src/uk/ac/ebi/vfb/neo4j/neo2neo/kb_tests.py: 243 changes (151 additions & 92 deletions)
@@ -2,16 +2,10 @@
 import sys
 import warnings
 import json
+from tqdm import tqdm

-nc = neo4j_connect(sys.argv[1], sys.argv[2], sys.argv[3])

-silent_mode = False
-# prevents sys.exit(1) on failure, just silently logs the result and exits
-if len(sys.argv) > 4 and sys.argv[4] == 'silent_fail':
-    silent_mode = True


-def query(query):
+def query(nc, query):
     q = nc.commit_list([query])
     if not q:
         return False
@@ -22,7 +16,7 @@ def query(query):
     return dc


-def query_ind_count(query):
+def query_ind_count(nc, query):
     q = nc.commit_list([query])
     if not q:
         return False
@@ -36,92 +30,157 @@ def query_ind_count(query):
     return dc[0]['ind_count']


-def compare(dataset: str, description: str, query1: str, query2: str, log: dict, verbose=False):
-    r1 = query(query1)[0]
-    r2 = query(query2)[0]
+def compare(nc, description: str, query1: str, query2: str, verbose=False):
+    r1 = query(nc, query1)[0]
+    r2 = query(nc, query2)[0]
     if r1['ind_count'] == r2['ind_count']:
         if verbose:
             print(query2)
             print("Testing assertion:" + description)
             print("Result: True")
-        return True
+        return None
     else:
         print("Testing assertion:" + description)
         print(query2)
         print("Result: inds_in_datset: %d ; Compliant with pattern: %d" % (r1['ind_count'], r2['ind_count']))
-        log[description + '. Failing Inds'] = list(set(r1['ind_list']) - set(r2['ind_list']))
-        return False


-datasets = nc.commit_list(
-    ["MATCH (ds:DataSet) RETURN ds.short_form"])  # removed "WHERE ds.schema = 'image'" as not in kb2
-dc = results_2_dict_list(datasets)

-return_state = True

-for d in dc:
-    log = {}
-    ds = d['ds.short_form']
-    dataset_status = True
-    print("\n")
-    print("Testing: " + ds)
-    final_clauses = " WHERE ds.short_form = '%s' RETURN COUNT (DISTINCT i) as ind_count" \
-                    ", COLLECT(i.short_form) as ind_list" % ds
-    base_query = "MATCH (ds:DataSet)<-[:has_source]-(i:Individual)"
-    new_base_query = "MATCH (ds:DataSet)<-[:Annotation { short_form: 'source'}]-(i:Individual)"
-    if query_ind_count(base_query + final_clauses) == 0:
-        if query_ind_count(new_base_query + final_clauses):
-            base_query = new_base_query
-            print("Using new schema for tests.")
+        failing_individuals = list(set(r1['ind_list']) - set(r2['ind_list']))
+        return {
+            'description': description,
+            'failed_individuals': failing_individuals,
+            'total_indv_count': r1['ind_count'],
+            'compliant_indv_count': r2['ind_count'],
+            'failed_indv_count': len(failing_individuals),
+            'query': query2
+        }


+def print_ratio_summary(desc, value, total):
+    print(desc + ":" + "".ljust(22-len(desc)) + str(value) + "/" + str(total) + " \t(" + str(int(100 * value / total)) + "%)")


+def dump_report_to_file(test_rpt):
+    with open("kb_test.report", 'w') as report:
+        report.write(json.dumps(test_rpt, indent=2))


+def test_kb(server, user, password, ds_short_forms, silent_mode):
+    nc = neo4j_connect(server, user, password)

+    datasets = nc.commit_list(
+        [get_ds_query(ds_short_forms)])  # removed "WHERE ds.schema = 'image'" as not in kb2
+        # ["MATCH (ds:DataSet) RETURN ds.short_form LIMIT 8"])
+    dc = results_2_dict_list(datasets)
+    test_report = dict()
+    test_report["failed_datasets"] = list()
+    test_report["empty_datasets"] = list()
+    test_report["successful_datasets"] = list()
+    test_count = 0
+    failed_test_count = 0

+    test_progress = tqdm(dc, desc='Test Progress', total=len(dc), bar_format='{l_bar}{bar:20}| {n_fmt}/{total_fmt}')
+    for d in test_progress:
+        ds = d['ds.short_form']
+        final_clauses = " WHERE ds.short_form = '%s' RETURN COUNT (DISTINCT i) as ind_count" \
+                        ", COLLECT(i.short_form) as ind_list" % ds
+        base_query = "MATCH (ds:DataSet)<-[:has_source]-(i:Individual)"
+        new_base_query = "MATCH (ds:DataSet)<-[:Annotation { short_form: 'source'}]-(i:Individual)"
+        if query_ind_count(nc, base_query + final_clauses) == 0:
+            if query_ind_count(nc, new_base_query + final_clauses):
+                base_query = new_base_query
+                print("Using new schema for tests.")
+            else:
+                test_report["empty_datasets"].append(ds)
+                continue
+        query1 = base_query + final_clauses
+        extended_base_query = base_query + "<-[:depicts]-(j:Individual)"

+        tests = list()
+        tests.append({'query': extended_base_query + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching channel individuals.",
+                      'name': 'matching_channel_test'})

+        tests.append({'query': extended_base_query + "-[in_register_with]->(k:Individual)" + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching registered channel individuals.",
+                      'name': 'registered_channel_test'})

+        tests.append({'query': extended_base_query + "-[:is_specified_output_of]->(:Class)" + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching channel individuals with imaging method.",
+                      'name': 'matching_channel_with_imaging_method_test'})

+        tests.append({'query': extended_base_query + "-[:INSTANCEOF]->(c:Class { label: 'channel'})" + final_clauses,
+                      'description': "All anatomical individuals in dataset have matching channel, typed individuals",
+                      'name': 'matching_channel_typed_individuals_test'})

+        tests.append({'query': base_query + "-[:INSTANCEOF]->(c:Class)" + final_clauses,
+                      'description': "All anatomical individuals in dataset are typed.",
+                      'name': 'typed_datasets_test'})

+        failed_tests = dict()
+        for test in tests:
+            result = compare(nc, description=test['description'], query1=query1, query2=test['query'])
+            test_count += 1
+            if result:
+                failed_tests[test['name']] = result
+                failed_test_count += 1

+        if failed_tests:
+            test_report["failed_datasets"].append({
+                'dataset': ds,
+                'failed_tests': failed_tests
+            })
         else:
-            print("This dataset has no content")
-            continue
-    query1 = base_query + final_clauses
-    extended_base_query = base_query + "<-[:depicts]-(j:Individual)"
-    query2 = extended_base_query + final_clauses
-    query3 = extended_base_query + "-[in_register_with]->(k:Individual)" + final_clauses
-    query4 = extended_base_query + "-[:is_specified_output_of]->(:Class)" + final_clauses
-    query5 = extended_base_query + "-[:INSTANCEOF]->(c:Class { label: 'channel'})" + final_clauses
-    query6 = base_query + "-[:INSTANCEOF]->(c:Class)" + final_clauses

-    test_stats = []

-    test_stats.append(compare(dataset=ds,
-                              description="All anatomical individuals in dataset have matching channel individuals.",
-                              query1=query1,
-                              query2=query2,
-                              log=log))
-    test_stats.append(
-        compare(description="All anatomical individuals in dataset have matching registered channel individuals.",
-                dataset=ds,
-                query1=query1,
-                query2=query3,
-                log=log))
-    test_stats.append(compare(
-        description="All anatomical individuals in dataset have matching channel individuals with imaging method",
-        dataset=ds,
-        query1=query1,
-        query2=query4,
-        log=log))
-    test_stats.append(
-        compare(description="All anatomical individuals in dataset have matching channel, typed individuals",
-                dataset=ds,
-                query1=query1,
-                query2=query5,
-                log=log))
-    test_stats.append(compare(description="All anatomical individuals in dataset are typed",
-                              dataset=ds,
-                              query1=query1,
-                              query2=query6,
-                              log=log))
-    if False in test_stats:
-        return_state = False
-        with open(ds + ".report", 'w') as report:
-            report.write(json.dumps(log))
+            test_report["successful_datasets"].append(ds)

test_report["summary"] = {
'dataset_count': len(dc),
'failed_dataset_count': len(test_report["failed_datasets"]),
'empty_dataset_count': len(test_report["empty_datasets"]),
'successful_datasets_count': len(test_report["successful_datasets"]),
'total_tests_run': test_count,
'failed_tests_count': failed_test_count
}
dump_report_to_console(test_report)
dump_report_to_file(test_report)
if (test_report["failed_datasets"] or test_report["empty_datasets"]) and not silent_mode:
sys.exit(1)


def get_ds_query(ds_short_forms):
if ds_short_forms:
q = "MATCH(ds: DataSet) WHERE ANY(sf IN ds.short_form WHERE sf IN {}) RETURN ds.short_form"\
.format(str(ds_short_forms))
else:
print("Passes!")

if not return_state and not silent_mode:
sys.exit(1)
q = "MATCH (ds:DataSet) RETURN ds.short_form"
return q


+def dump_report_to_console(test_rpt):
+    if "summary" not in test_rpt:
+        print("\nNo tests run!")
+        return
+    print("")
+    summary = test_rpt["summary"]

+    print_ratio_summary("Failed Tests", summary["failed_tests_count"], summary["total_tests_run"])
+    print_ratio_summary("Failed Datasets", summary["failed_dataset_count"], summary["dataset_count"])
+    print_ratio_summary("Empty Datasets", summary["empty_dataset_count"], summary["dataset_count"])
+    print_ratio_summary("Successful Datasets", summary["successful_datasets_count"], summary["dataset_count"])

+    print("")
+    failed_datasets = test_rpt["failed_datasets"]
+    if failed_datasets:
+        print("=== Failed Datasets:\n")
+        for failed_dataset in failed_datasets:
+            print("Dataset: " + failed_dataset["dataset"] + "\n")
+            ds_failed_tests = failed_dataset["failed_tests"]
+            for test_name in ds_failed_tests:
+                failed_test = ds_failed_tests[test_name]
+                print("\tTesting assertion: " + failed_test["description"])
+                print("\t{} of total {} individuals failed. First 5 failing individuals are: {}"
+                      .format(str(failed_test["failed_indv_count"]),
+                              str(failed_test["total_indv_count"]),
+                              str(failed_test["failed_individuals"][:5])))
+                print("\t{}\n".format(failed_test["query"]))

+    if test_rpt["empty_datasets"]:
+        print("=== Empty Datasets:\n")
+        for empty_dataset in test_rpt["empty_datasets"]:
+            print(" - " + empty_dataset)

+    if not failed_datasets and not test_rpt["empty_datasets"]:
+        print("All dataset tests are successful.")

# KB <-> prod check numbers
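
A minimal sketch of how the kb_test.report file written by dump_report_to_file could be read back. The key names come from test_kb and compare above; the file path and the printed wording are illustrative only.

import json

# Load the JSON report that dump_report_to_file writes to the working directory.
with open("kb_test.report") as report_file:
    report = json.load(report_file)

summary = report["summary"]
print("{} of {} tests failed".format(summary["failed_tests_count"], summary["total_tests_run"]))

for failed in report["failed_datasets"]:
    # Each entry pairs a dataset short form with the per-test failure dicts
    # returned by compare(): description, failed_individuals, counts and query.
    print(failed["dataset"], sorted(failed["failed_tests"].keys()))
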
src/uk/ac/ebi/vfb/neo4j/neo2neo/kb_tests_runner.py: 24 changes (24 additions & 0 deletions)
@@ -0,0 +1,24 @@
+import argparse
+from uk.ac.ebi.vfb.neo4j.neo2neo.kb_tests import test_kb


+def main():
+    parser = argparse.ArgumentParser(prog="kb_test", description='VFB KB consistency tester cli interface.')
+    parser.add_argument('-k', '--kb', action='store', required=True, help='KB server URI')
+    parser.add_argument('-u', '--user', action='store', required=True, help='KB server user')
+    parser.add_argument('-p', '--password', action='store', required=True, help='KB server password')
+    parser.add_argument('-s', '--silent', action='store_true', help='Activates silent mode that prevents abnormal exit.')
+    parser.add_argument('-d', '--dataset', action='append', nargs='*', help='Short form of the dataset to test.')

+    args = parser.parse_args()

+    datasets = list()
+    if 'dataset' in args and args.dataset:
+        # handle both '-d x -d y' and '-d x y'
+        datasets = [item for sublist in args.dataset for item in sublist]

+    test_kb(args.kb, args.user, args.password, datasets, args.silent)


+if __name__ == "__main__":
+    main()
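
A usage sketch for the new runner: it is meant to be invoked as a module, e.g. python -m uk.ac.ebi.vfb.neo4j.neo2neo.kb_tests_runner -k <server> -u <user> -p <password>, optionally with --silent and one or more -d dataset short forms. The equivalent direct call below uses placeholder connection details and a made-up dataset short form, not values from this PR.

from uk.ac.ebi.vfb.neo4j.neo2neo.kb_tests import test_kb

# Placeholder server, credentials and dataset short form; silent_mode=True
# mirrors --silent and suppresses the sys.exit(1) on failure.
test_kb("http://localhost:7474", "neo4j", "<password>",
        ds_short_forms=["example_dataset"], silent_mode=True)
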