Got extraction working on sample and original cheque file (#70)

marlanperumal · Jan 16, 2025 · 125f714 · 125f714
1 parent 21768e8
commit 125f714
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -93,6 +93,8 @@ Once again for the default this will be
 
 The configuration file itself is in JSON format. Here's the Absa cheque account one with some commentary to explain what each field does.
 
+The dimensions supplied in the `area` and `columns` parameters are specified in points (pts), where 72 pts = 1 inch. For reference, letter size paper is 8.5 x 11.0 inches (612 x 792 pts) and A4 paper is 8.27 x 11.69 inches (approximately 595 x 842 pts). The origin (0, 0) is located at the top left corner of the page. This is probably the most intuitive convention; note, however, that it differs from the PDF standard, which places the origin at the *bottom* left of the page.
+
 ```json5
 {
     "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
@@ -103,7 +105,7 @@ The configuration file itself is in JSON format. Here's the Absa cheque account
             // The page coordinates of the area containing the table, in pts
             // [top, left, bottom, right]
             "area": [280, 27, 763, 576],
-            // The right x coordinate of each column in the table
+            // The right x coordinate of each column in the table in pts
             "columns": [83, 264, 344, 425, 485, 570]
         },
         // Layout for the first page

diff --git a/pdf_statement_reader/config/za/absa/cheque-sample.json b/pdf_statement_reader/config/za/absa/cheque-sample.json
@@ -0,0 +1,40 @@
+{
+    "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
+    "layout": {
+        "default": {
+            "area": [183, 78, 731, 530],
+            "columns": [120, 289, 290, 330, 350, 415, 475, 530]
+        },
+        "first": {
+            "area": [419, 78, 731, 530],
+            "columns": [120, 289, 290, 330, 350, 415, 475, 530]
+        }
+    },
+    "columns": {
+        "trans_date": "Date",
+        "trans_type": "Transaction Description",
+        "trans_detail": "Transaction Detail",
+        "charge": "Charge",
+        "charge_type": "Charge Type",
+        "debit": "Debit Amount",
+        "credit": "Credit Amount",
+        "balance": "Balance"
+    },
+    "order": [
+        "trans_date",
+        "trans_type",
+        "trans_detail",
+        "charge",
+        "charge_type",
+        "debit",
+        "credit",
+        "balance"
+    ],
+    "cleaning": {
+        "numeric": ["charge", "debit", "credit", "balance"],
+        "date": ["trans_date"],
+        "date_format": "%d/%m/%Y",
+        "trans_detail": "below",
+        "dropna": ["balance"]
+    }
+}
diff --git a/pdf_statement_reader/config/za/absa/cheque.json b/pdf_statement_reader/config/za/absa/cheque.json
@@ -2,18 +2,19 @@
     "$schema": "https://raw.githubusercontent.com/marlanperumal/pdf_statement_reader/develop/pdf_statement_reader/config/psr_config.schema.json",
     "layout": {
         "default": {
-            "area": [280, 27, 763, 576],
-            "columns": [83, 264, 344, 425, 485, 570]
+            "area": [280, 27, 795, 576],
+            "columns": [83, 263, 264, 344, 425, 485, 570]
         },
         "first": {
             "area": [480, 27, 763, 576],
-            "columns": [83, 264, 344, 425, 485, 570]
+            "columns": [83, 263, 264, 344, 425, 485, 570]
         }
     },
     "columns": {
         "trans_date": "Date",
         "trans_type": "Transaction Description",
         "trans_detail": "Transaction Detail",
+        "charge": "Charge",
         "debit": "Debit Amount",
         "credit": "Credit Amount",
         "balance": "Balance"
@@ -22,6 +23,7 @@
         "trans_date",
         "trans_type",
         "trans_detail",
+        "charge",
         "debit",
         "credit",
         "balance"

diff --git a/pdf_statement_reader/parse.py b/pdf_statement_reader/parse.py
@@ -67,8 +67,6 @@ def clean_trans_detail(df, config):
     trans_type = config["columns"]["trans_type"]
     balance = config["columns"]["balance"]
 
-    df[trans_detail] = ""
-
     for i, row in df.iterrows():
         if i == 0:
             continue
@@ -82,8 +80,9 @@ def clean_dropna(df, config):
 
 
 def reorder_columns(df, config):
-    columns = [config["columns"][col] for col in config["order"]]
-    return df[columns]
+    column_mapper = {a: b for a, b in zip(df.columns, config["columns"].values())}
+    ordered_columns = [config["columns"][col] for col in config["order"]]
+    return df.rename(columns=column_mapper)[ordered_columns]
 
 
 def parse_statement(filename, config):
@@ -95,16 +94,16 @@ def parse_statement(filename, config):
     if "numeric" in config["cleaning"]:
         clean_numeric(statement, config)
 
-    if "trans_detail" in config["cleaning"]:
-        clean_trans_detail(statement, config)
-
     if "date" in config["cleaning"]:
         clean_date(statement, config)
 
-    if "dropna" in config["cleaning"]:
-        clean_dropna(statement, config)
-
     if "order" in config:
         statement = reorder_columns(statement, config)
 
+    if "trans_detail" in config["cleaning"]:
+        clean_trans_detail(statement, config)
+
+    if "dropna" in config["cleaning"]:
+        clean_dropna(statement, config)
+
     return statement
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 urls = { "issues" = "https://github.com/marlanperumal/pdf_statement_reader/issues", "source" = "https://github.com/marlanperumal/pdf_statement_reader", "homepage" = "https://github.com/marlanperumal/pdf_statement_reader" }
 name = "pdf-statement-reader"
-version = "0.3.2"
+version = "0.3.4"
 description = "PDF Statement Reader"
 keywords = ["pdf", "statement", "reader", "bank statement", "digitise"]
 authors = [{ name = "Marlan Perumal", email = "[email protected]" }]

diff --git a/uv.lock b/uv.lock