From 0e6259a18e20c504a8858d7ced93d1358d55a5f1 Mon Sep 17 00:00:00 2001
From: Diomidis Spinellis
Date: Wed, 9 Oct 2024 19:17:26 +0300
Subject: [PATCH] Filter Python CD index results with queries

This is more efficient than a single DELETE statement.
---
Review notes (this section is not part of the commit message):
- NOTE(review): the removed DELETE kept works with published_year <= 2018
  OR at least one reference; valid_cd5index.sql keeps only works that
  satisfy both conditions (AND). Confirm that this tightening is intended.
- Dropped a stray second semicolon at the end of cd5index-all-py.sql.
- Line breaks and indentation were reconstructed from the hunk counts;
  verify the cdindex-db.py hunk against the target file (or apply with
  --ignore-whitespace).

 examples/cdindex/cd5index-all-py.sql |  6 ++++++
 examples/cdindex/cdindex-db.py       | 14 --------------
 examples/cdindex/valid_cd5index.sql  |  9 +++++++++
 3 files changed, 15 insertions(+), 14 deletions(-)
 create mode 100644 examples/cdindex/cd5index-all-py.sql
 create mode 100644 examples/cdindex/valid_cd5index.sql

diff --git a/examples/cdindex/cd5index-all-py.sql b/examples/cdindex/cd5index-all-py.sql
new file mode 100644
index 0000000..d79f469
--- /dev/null
+++ b/examples/cdindex/cd5index-all-py.sql
@@ -0,0 +1,6 @@
+-- Export the CD5 index for works where one can be calculated
+
+CREATE INDEX IF NOT EXISTS rolap.valid_cd5index_doi_idx ON valid_cd5index(doi);
+
+SELECT doi, cdindex FROM rolap.cdindex
+  INNER JOIN rolap.valid_cd5index USING(doi);
diff --git a/examples/cdindex/cdindex-db.py b/examples/cdindex/cdindex-db.py
index a01689c..bdb2392 100644
--- a/examples/cdindex/cdindex-db.py
+++ b/examples/cdindex/cdindex-db.py
@@ -152,18 +152,4 @@ def process_batch(start):
 
     db.commit()
 
-    db.execute("""
-    -- Works and references
-    ATTACH 'cdindex.db' AS wr;
-
-    DELETE FROM cdindex
-    WHERE doi NOT IN (
-      SELECT cdindex.doi FROM cdindex
-      INNER JOIN wr.works USING(doi)
-      WHERE works.published_year <= 2018 OR
-        (SELECT 1 FROM work_references WHERE work_id == works.id)
-    );
-    """)
-    perf.log("Remove invalid records")
-
     db.close()
diff --git a/examples/cdindex/valid_cd5index.sql b/examples/cdindex/valid_cd5index.sql
new file mode 100644
index 0000000..a637323
--- /dev/null
+++ b/examples/cdindex/valid_cd5index.sql
@@ -0,0 +1,9 @@
+-- Create a table of publications for which a valid CD5 index can be calculated
+
+CREATE INDEX IF NOT EXISTS rolap.cdindex_doi_idx ON cdindex(doi);
+
+CREATE TABLE rolap.valid_cd5index AS
+  SELECT cdindex.doi FROM rolap.cdindex
+  INNER JOIN works USING(doi)
+  WHERE works.published_year <= 2018 AND
+    (SELECT 1 FROM work_references WHERE work_id == works.id);