From 6d0546ae11dfe9fe75da89d212061b87e456f106 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 5 Feb 2016 20:49:09 -0500 Subject: [PATCH 01/63] Here's a script for backfilling routes & exchanges --- backfill.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++ sql/branch.sql | 4 +++ 2 files changed, 75 insertions(+) create mode 100755 backfill.py create mode 100644 sql/branch.sql diff --git a/backfill.py b/backfill.py new file mode 100755 index 0000000000..425b75cca9 --- /dev/null +++ b/backfill.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python2 -u +"""Script for backfilling exchange status, route, and ref. + +Symlink a directory of data at `./backfill` and then call like so: + + [gratipay] $ run_dammit defaults.env local.env -c env/bin/python backfill.py + +Data files should be one per network (named `samurai`, `stripe`, etc), as CSVs +with these columns: + + user_id required Gratipay participant.id + username ignored + address optional defaults to 'fake-deadbeef' + exchange_id required Gratipay exchanges.id + status optional defaults to 'succeeded' + ref optional defaults to 'fake-beeffeed' + +For successfully backfilled exchanges (and routes), the script outputs the same +CSV as was input, with optional fields filled in. The script is idempotent (the +faked address and ref are hashed from other input values). 
+ +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import csv +import os +import sha +import sys +from os import path + +from gratipay import wireup +from gratipay.models.exchange_route import ExchangeRoute +from gratipay.models.participant import Participant + + +BASE = path.dirname(__file__) + + +def fake(*a): + return 'fake-' + sha.new(''.join(map(str, a))).hexdigest() + + +def link(db, log, user_id, network, address, exchange_id, status, ref): + participant = Participant.from_id(user_id) + route = ExchangeRoute.from_network(participant, network) + if route is None: + route = ExchangeRoute.insert(participant, network, address) + db.run( "UPDATE exchanges SET status=%s, route=%s, ref=%s WHERE id=%s" + , (status, route.id, ref, exchange_id) + ) + log(participant.id, participant.username, address, exchange_id, status, ref ) + + +def main(db, log): + for network in os.listdir('backfill'): + data = csv.reader(open(path.join('backfill', network))) + for user_id, _, address, exchange_id, status, ref in data: + assert user_id + address = address or fake(user_id, network) + assert exchange_id + status = status or 'succeeded' + ref = ref or fake(user_id, network, exchange_id) + + link(db, log, user_id, network, address, exchange_id, status, ref) + + +if __name__ == '__main__': + db = wireup.db(wireup.env()) + writer = csv.writer(sys.stdout) + log = lambda *a: writer.writerow(a) + main(db, log) diff --git a/sql/branch.sql b/sql/branch.sql new file mode 100644 index 0000000000..74a9f6fd6e --- /dev/null +++ b/sql/branch.sql @@ -0,0 +1,4 @@ +ALTER TYPE payment_net ADD VALUE 'samurai'; +ALTER TYPE payment_net ADD VALUE 'stripe'; + +ALTER TABLE exchanges ADD UNIQUE (ref); From 5e1d16c4a73978eedbd47c109dbf16ea80faf6ad Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Sat, 6 Feb 2016 09:27:19 -0500 Subject: [PATCH 02/63] Scope ref uniqueness to network --- sql/branch.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sql/branch.sql b/sql/branch.sql index 74a9f6fd6e..93336aeae9 100644 --- a/sql/branch.sql +++ b/sql/branch.sql @@ -1,4 +1,4 @@ ALTER TYPE payment_net ADD VALUE 'samurai'; ALTER TYPE payment_net ADD VALUE 'stripe'; -ALTER TABLE exchanges ADD UNIQUE (ref); +ALTER TABLE exchanges ADD UNIQUE (network, ref); From 9358207bfe21b0fd835354b0b30b84452362eb8c Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 11 Feb 2016 07:49:52 -0500 Subject: [PATCH 03/63] Drop unique constraint for now It's more complicated, because we don't have `network` in the `exchanges` table, only in `exchange_routes`. --- sql/branch.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/branch.sql b/sql/branch.sql index 93336aeae9..e8c38225d7 100644 --- a/sql/branch.sql +++ b/sql/branch.sql @@ -1,4 +1,2 @@ ALTER TYPE payment_net ADD VALUE 'samurai'; ALTER TYPE payment_net ADD VALUE 'stripe'; - -ALTER TABLE exchanges ADD UNIQUE (network, ref); From 6ad00b365742450d2829d2950cf153cc90071727 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 11 Feb 2016 07:53:25 -0500 Subject: [PATCH 04/63] Tweak CSV format user_id and address are conceptually related, as are exchange_id and ref --- backfill.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/backfill.py b/backfill.py index 425b75cca9..bf9a737603 100755 --- a/backfill.py +++ b/backfill.py @@ -8,12 +8,12 @@ Data files should be one per network (named `samurai`, `stripe`, etc), as CSVs with these columns: - user_id required Gratipay participant.id username ignored + user_id required Gratipay participant.id address optional defaults to 'fake-deadbeef' exchange_id required Gratipay exchanges.id - status optional defaults to 'succeeded' ref optional defaults to 'fake-beeffeed' + status required Gratipay exchanges.status: succeeded, failed, pending For successfully backfilled exchanges (and routes), the script outputs the same CSV as was input, with optional fields filled in.
The script is idempotent (the @@ -40,7 +40,7 @@ def fake(*a): return 'fake-' + sha.new(''.join(map(str, a))).hexdigest() -def link(db, log, user_id, network, address, exchange_id, status, ref): +def link(db, log, network, user_id, address, exchange_id, ref, status): participant = Participant.from_id(user_id) route = ExchangeRoute.from_network(participant, network) if route is None: @@ -48,20 +48,21 @@ def link(db, log, user_id, network, address, exchange_id, status, ref): db.run( "UPDATE exchanges SET status=%s, route=%s, ref=%s WHERE id=%s" , (status, route.id, ref, exchange_id) ) - log(participant.id, participant.username, address, exchange_id, status, ref ) + log(participant.username, participant.id, address, exchange_id, ref, status) def main(db, log): for network in os.listdir('backfill'): + if network.startswith('_'): continue data = csv.reader(open(path.join('backfill', network))) - for user_id, _, address, exchange_id, status, ref in data: + for _, user_id, address, exchange_id, ref, status in data: assert user_id - address = address or fake(user_id, network) + address = address or fake(network, user_id) assert exchange_id - status = status or 'succeeded' - ref = ref or fake(user_id, network, exchange_id) + ref = ref or fake(network, user_id, exchange_id) + assert status - link(db, log, user_id, network, address, exchange_id, status, ref) + link(db, log, network, user_id, address, exchange_id, ref, status) if __name__ == '__main__': From d5605b5290af41c1c0a4a2ca21c5369d32f9c6b1 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 11 Feb 2016 07:55:35 -0500 Subject: [PATCH 05/63] Add a matchup script This takes an export from a payment network (Stripe so far) and attempts to find an exchange to go with it. 
--- matchup.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100755 matchup.py diff --git a/matchup.py b/matchup.py new file mode 100755 index 0000000000..37aff20c49 --- /dev/null +++ b/matchup.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals + +import csv + +from gratipay import wireup + + +def find(db, rec): + exact = db.one(""" + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE "timestamp" - %(Created)s < '60 seconds'::interval + AND amount + fee = %(Amount)s + AND amount > 0 + AND participant = %(Description)s + + """, rec) + if exact: + out = [exact] + else: + out = db.all(""" + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE "timestamp" - %(Created)s < '60 seconds'::interval + AND amount + fee = %(Amount)s + AND amount > 0 + + """, rec) + return out + + +def main(db): + reader = csv.reader(open('backfill/_stripe-transfers.csv')) + writer = csv.writer(open('backfill/stripe', 'w')) + headers = next(reader) + for row in reader: + rec = dict(zip(headers, row)) + matches = find(db, rec) + for match in matches: + writer.writerow([ match.participant + , match.user_id + , '' + , match.id + , rec['ID'] + , 'succeeded' + ]) + if match.participant != rec['Description']: + print(rec['Description'], '=>', match.participant) + + +if __name__ == '__main__': + db = wireup.db(wireup.env()) + main(db) From ed4dd5737ea7aea4662eb9d65b39ee4593de7597 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 11 Feb 2016 08:06:03 -0500 Subject: [PATCH 06/63] Log network with backfill --- backfill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backfill.py b/backfill.py index bf9a737603..6e81526947 100755 --- a/backfill.py +++ b/backfill.py @@ -48,7 +48,7 @@ def link(db, log, network, user_id, address, 
exchange_id, ref, status): db.run( "UPDATE exchanges SET status=%s, route=%s, ref=%s WHERE id=%s" , (status, route.id, ref, exchange_id) ) - log(participant.username, participant.id, address, exchange_id, ref, status) + log(network, participant.username, participant.id, address, exchange_id, ref, status) def main(db, log): From a5b8254ad8fc72af807675de290559bfd5e93834 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 12 Feb 2016 09:09:18 -0500 Subject: [PATCH 07/63] Update Stripe match script for payments.csv --- match-stripe.py | 140 ++++++++++++++++++++++++++++++++++++++++++++++++ matchup.py | 60 --------------------- 2 files changed, 140 insertions(+), 60 deletions(-) create mode 100755 match-stripe.py delete mode 100755 matchup.py diff --git a/match-stripe.py b/match-stripe.py new file mode 100755 index 0000000000..1d998156bc --- /dev/null +++ b/match-stripe.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import csv + +from gratipay import wireup + + +def find(db, rec): + return db.one(""" + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE "timestamp" - %(Created)s < '60 seconds'::interval + AND amount + fee = %(Amount)s + AND amount > 0 + AND participant = %(Description)s + + """, rec) + + +def fuzz(db, rec): + import pdb; pdb.set_trace() + return db.all(""" + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE "timestamp" - %(Created)s < '60 seconds'::interval + AND amount + fee = %(Amount)s + AND amount > 0 + + """, rec) + + +def process_month(db, year, month): + reader = csv.reader(open('3912/{}/{}/_stripe-payments.csv'.format(year, month))) + writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w')) + + def emit(match, rec): + writer.writerow([ match.participant + , match.user_id + , rec['Customer ID'] + , match.id + , 
rec['id'] + , rec['Status'] + ]) + + headers = next(reader) + matched = [] + inexact = [] + + for row in reader: + rec = dict(zip(headers, row)) + rec['Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier + + exact = find(db, rec) + if exact: + emit(exact, rec) + matched.append(exact.user_id) + else: + inexact.append(rec) + + for row in inexact: + fuzzed = fuzz(db, rec) + possible = [m for m in fuzzed if not m.id in matched] + assert len(possible) == 1, possible + guess = possible[0] + print(rec['Description'], '=>', guess.participant) + emit(guess, rec) + + +def main(db): + for year in os.listdir('3912'): + if not year.isdigit(): continue + for month in os.listdir('3912/' + year): + if not month.isdigit(): continue + process_month(db, year, month) + + +if __name__ == '__main__': + db = wireup.db(wireup.env()) + main(db) + + +""" +Fields in _stripe-payments.csv: + + id + Description + Created (UTC) + Amount + Amount Refunded + Currency + Converted Amount + Converted Amount Refunded + Fee + Tax + Converted Currency + Mode + Status + Statement Descriptor + Customer ID + Customer Description + Customer Email + Captured + Card ID + Card Last4 + Card Brand + Card Funding + Card Exp Month + Card Exp Year + Card Name + Card Address Line1 + Card Address Line2 + Card Address City + Card Address State + Card Address Country + Card Address Zip + Card Issue Country + Card Fingerprint + Card CVC Status + Card AVS Zip Status + Card AVS Line1 Status + Card Tokenization Method + Disputed Amount + Dispute Status + Dispute Reason + Dispute Date (UTC) + Dispute Evidence Due (UTC) + Invoice ID + Payment Source Type + Destination + Transfer + +""" diff --git a/matchup.py b/matchup.py deleted file mode 100755 index 37aff20c49..0000000000 --- a/matchup.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python2 -from __future__ import absolute_import, division, print_function, unicode_literals - -import csv - -from gratipay import wireup - - -def find(db, rec): - exact = 
db.one(""" - - SELECT e.*, p.id as user_id - FROM exchanges e - JOIN participants p - ON e.participant = p.username - WHERE "timestamp" - %(Created)s < '60 seconds'::interval - AND amount + fee = %(Amount)s - AND amount > 0 - AND participant = %(Description)s - - """, rec) - if exact: - out = [exact] - else: - out = db.all(""" - - SELECT e.*, p.id as user_id - FROM exchanges e - JOIN participants p - ON e.participant = p.username - WHERE "timestamp" - %(Created)s < '60 seconds'::interval - AND amount + fee = %(Amount)s - AND amount > 0 - - """, rec) - return out - - -def main(db): - reader = csv.reader(open('backfill/_stripe-transfers.csv')) - writer = csv.writer(open('backfill/stripe', 'w')) - headers = next(reader) - for row in reader: - rec = dict(zip(headers, row)) - matches = find(db, rec) - for match in matches: - writer.writerow([ match.participant - , match.user_id - , '' - , match.id - , rec['ID'] - , 'succeeded' - ]) - if match.participant != rec['Description']: - print(rec['Description'], '=>', match.participant) - - -if __name__ == '__main__': - db = wireup.db(wireup.env()) - main(db) From cd835e5aa2df7b1f2d8f677d24250293c9594320 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Tue, 16 Feb 2016 12:32:15 -0500 Subject: [PATCH 08/63] Clean up a couple things in match-stripe.py --- match-stripe.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index 1d998156bc..ece8d58f3a 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -23,7 +23,6 @@ def find(db, rec): def fuzz(db, rec): - import pdb; pdb.set_trace() return db.all(""" SELECT e.*, p.id as user_id @@ -39,7 +38,7 @@ def fuzz(db, rec): def process_month(db, year, month): reader = csv.reader(open('3912/{}/{}/_stripe-payments.csv'.format(year, month))) - writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w')) + writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w+')) def emit(match, rec): writer.writerow([ 
match.participant @@ -56,7 +55,7 @@ def emit(match, rec): for row in reader: rec = dict(zip(headers, row)) - rec['Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier + rec[b'Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier exact = find(db, rec) if exact: @@ -65,9 +64,9 @@ def emit(match, rec): else: inexact.append(rec) - for row in inexact: + for rec in inexact: fuzzed = fuzz(db, rec) - possible = [m for m in fuzzed if not m.id in matched] + possible = [m for m in fuzzed if not m.user_id in matched] assert len(possible) == 1, possible guess = possible[0] print(rec['Description'], '=>', guess.participant) From 5e965f9d1c73017b3dfc04e2c39ded735dba0e2c Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 18 Feb 2016 08:08:20 -0500 Subject: [PATCH 09/63] Emit rows in original order --- match-stripe.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index ece8d58f3a..5a2c934227 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -40,27 +40,22 @@ def process_month(db, year, month): reader = csv.reader(open('3912/{}/{}/_stripe-payments.csv'.format(year, month))) writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w+')) - def emit(match, rec): - writer.writerow([ match.participant - , match.user_id - , rec['Customer ID'] - , match.id - , rec['id'] - , rec['Status'] - ]) - headers = next(reader) matched = [] + rec2mat = {} inexact = [] + ordered = [] for row in reader: rec = dict(zip(headers, row)) rec[b'Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier - exact = find(db, rec) - if exact: - emit(exact, rec) - matched.append(exact.user_id) + ordered.append(rec) + + match = find(db, rec) + if match: + matched.append(match.user_id) + rec2mat[rec['id']] = match else: inexact.append(rec) @@ -70,7 +65,17 @@ def emit(match, rec): assert len(possible) == 1, possible guess = possible[0] print(rec['Description'], 
'=>', guess.participant) - emit(guess, rec) + rec2mat[rec['id']] = guess + + for rec in ordered: + match = rec2mat[rec['id']] + writer.writerow([ match.participant + , match.user_id + , rec['Customer ID'] + , match.id + , rec['id'] + , rec['Status'] + ]) def main(db): From ab37c09c4075a4eb6c4baafba9c9191ac45564e7 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 18 Feb 2016 17:12:04 -0500 Subject: [PATCH 10/63] Ignore the test transaction --- match-stripe.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/match-stripe.py b/match-stripe.py index 5a2c934227..a1907b071c 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -50,6 +50,9 @@ def process_month(db, year, month): rec = dict(zip(headers, row)) rec[b'Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier + if rec['id'] == 'ch_Pi3yBdmevsIr5q': + continue # special-case the first test transaction + ordered.append(rec) match = find(db, rec) From b122860152730fdd9dd937586e4e9402d81dbc36 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 18 Feb 2016 17:12:13 -0500 Subject: [PATCH 11/63] Add some debugging aids --- match-stripe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/match-stripe.py b/match-stripe.py index a1907b071c..f0e1ca84af 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -8,6 +8,7 @@ def find(db, rec): + print("find", rec['Description']) return db.one(""" SELECT e.*, p.id as user_id @@ -23,6 +24,7 @@ def find(db, rec): def fuzz(db, rec): + print("fuzz", rec['Description']) return db.all(""" SELECT e.*, p.id as user_id @@ -46,6 +48,9 @@ def process_month(db, year, month): inexact = [] ordered = [] + header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) + + header("FIRST PASS") for row in reader: rec = dict(zip(headers, row)) rec[b'Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier @@ -62,6 +67,7 @@ def process_month(db, year, month): else: inexact.append(rec) + header("SECOND PASS") for rec in inexact: fuzzed = 
fuzz(db, rec) possible = [m for m in fuzzed if not m.user_id in matched] @@ -70,6 +76,7 @@ def process_month(db, year, month): print(rec['Description'], '=>', guess.participant) rec2mat[rec['id']] = guess + header("THIRD PASS") for rec in ordered: match = rec2mat[rec['id']] writer.writerow([ match.participant From b1fd83b5cb3fe32656dabbd979bb011d51415439 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 3 Mar 2016 08:22:38 -0500 Subject: [PATCH 12/63] Improve match script so we don't exception --- match-stripe.py | 94 ++++++++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index f0e1ca84af..ec052bf915 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -1,41 +1,47 @@ #!/usr/bin/env python2 from __future__ import absolute_import, division, print_function, unicode_literals -import os import csv +import os +import sys from gratipay import wireup -def find(db, rec): - print("find", rec['Description']) - return db.one(""" +FUZZ = """ SELECT e.*, p.id as user_id FROM exchanges e JOIN participants p ON e.participant = p.username - WHERE "timestamp" - %(Created)s < '60 seconds'::interval + WHERE ( + ((("timestamp" - %(Created)s) < '0 seconds') AND + (("timestamp" - %(Created)s) > '-60 seconds')) + OR + (("timestamp" - %(Created)s) = '0 seconds') + OR + ((("timestamp" - %(Created)s) > '0 seconds') AND + (("timestamp" - %(Created)s) < '60 seconds')) + ) AND amount + fee = %(Amount)s AND amount > 0 + +""" +FIND = FUZZ + """ + AND participant = %(Description)s - """, rec) +""" -def fuzz(db, rec): - print("fuzz", rec['Description']) - return db.all(""" +def find(log, db, rec): + log("finding", rec['Description']) + return db.one(FIND, rec) - SELECT e.*, p.id as user_id - FROM exchanges e - JOIN participants p - ON e.participant = p.username - WHERE "timestamp" - %(Created)s < '60 seconds'::interval - AND amount + fee = %(Amount)s - AND amount > 0 - """, rec) +def fuzz(log, db, 
rec): + log("fuzzing", rec['Description'], end='') + return db.all(FUZZ, rec) def process_month(db, year, month): @@ -49,8 +55,9 @@ def process_month(db, year, month): ordered = [] header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) + log = lambda *a, **kw: print('{}-{}'.format(year, month), *a, **kw) - header("FIRST PASS") + header("FINDING") for row in reader: rec = dict(zip(headers, row)) rec[b'Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier @@ -58,47 +65,62 @@ def process_month(db, year, month): if rec['id'] == 'ch_Pi3yBdmevsIr5q': continue # special-case the first test transaction + if rec['Status'] == 'Failed': + print("{} is a FAILURE!!!!!!!".format(rec['Description'])) + continue # right? + ordered.append(rec) - match = find(db, rec) + match = find(log, db, rec) if match: matched.append(match.user_id) rec2mat[rec['id']] = match else: inexact.append(rec) - header("SECOND PASS") + header("FUZZING") for rec in inexact: - fuzzed = fuzz(db, rec) + fuzzed = fuzz(log, db, rec) possible = [m for m in fuzzed if not m.user_id in matched] - assert len(possible) == 1, possible - guess = possible[0] - print(rec['Description'], '=>', guess.participant) - rec2mat[rec['id']] = guess + npossible = len(possible) + print(' => ', end='') + if npossible > 1: + print(' OR '.join([p.participant for p in possible])) + elif npossible == 1: + guess = possible[0] + print(guess.participant) + rec2mat[rec['id']] = guess + else: + print('???', rec['Amount'], rec['Created']) - header("THIRD PASS") + header("WRITING") for rec in ordered: - match = rec2mat[rec['id']] - writer.writerow([ match.participant - , match.user_id - , rec['Customer ID'] - , match.id - , rec['id'] - , rec['Status'] - ]) + match = rec2mat.get(rec['id']) + if match is None: + log("skipping", rec['Description']) + else: + writer.writerow([ match.participant + , match.user_id + , rec['Customer ID'] + , match.id + , rec['id'] + , rec['Status'] + ]) -def main(db): +def main(db, 
constraint): for year in os.listdir('3912'): if not year.isdigit(): continue for month in os.listdir('3912/' + year): if not month.isdigit(): continue + if constraint and not '{}-{}'.format(year, month) == constraint: continue process_month(db, year, month) if __name__ == '__main__': db = wireup.db(wireup.env()) - main(db) + constraint = '' if len(sys.argv) < 2 else sys.argv[1] + main(db, constraint) """ From e9c5581ca2507f5bc68fec34e75dbf7fcfe43769 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 3 Mar 2016 18:26:00 -0500 Subject: [PATCH 13/63] Stripe's timestamp is only minute granularity --- match-stripe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index ec052bf915..ea806f4a84 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -16,12 +16,12 @@ ON e.participant = p.username WHERE ( ((("timestamp" - %(Created)s) < '0 seconds') AND - (("timestamp" - %(Created)s) > '-60 seconds')) + (("timestamp" - %(Created)s) > '-61 seconds')) OR (("timestamp" - %(Created)s) = '0 seconds') OR ((("timestamp" - %(Created)s) > '0 seconds') AND - (("timestamp" - %(Created)s) < '60 seconds')) + (("timestamp" - %(Created)s) < '61 seconds')) ) AND amount + fee = %(Amount)s AND amount > 0 From dd3ab2f880281027c9f8844189a4d5f3637d96f6 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 3 Mar 2016 18:45:23 -0500 Subject: [PATCH 14/63] Smarten up fuzz guessing --- match-stripe.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index ea806f4a84..db2c741e6c 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -53,6 +53,7 @@ def process_month(db, year, month): rec2mat = {} inexact = [] ordered = [] + fuz2mat = {} header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) log = lambda *a, **kw: print('{}-{}'.format(year, month), *a, **kw) @@ -80,18 +81,30 @@ def process_month(db, year, month): header("FUZZING") for rec in inexact: 
+ guess = fuz2mat.get(rec['Description']) + fuzzed = fuzz(log, db, rec) possible = [m for m in fuzzed if not m.user_id in matched] npossible = len(possible) print(' => ', end='') - if npossible > 1: - print(' OR '.join([p.participant for p in possible])) + + match = None + if npossible == 0: + print('???', rec['Amount'], rec['Created']) # should log "skipping" below elif npossible == 1: - guess = possible[0] - print(guess.participant) - rec2mat[rec['id']] = guess + match = possible[0] + if rec['Description'] in fuz2mat: + print('(again) ', end='') + else: + fuz2mat[rec['Description']] = match else: - print('???', rec['Amount'], rec['Created']) + match = {m.participant:m for m in possible}.get(guess.participant) + if not match: + print(' OR '.join([p.participant for p in possible])) + + if match: + print(match.participant) + rec2mat[rec['id']] = match header("WRITING") for rec in ordered: From e2ccc3a4b7cb9063264b65e9f68bbd8bc2cbb0d4 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 3 Mar 2016 18:46:29 -0500 Subject: [PATCH 15/63] Unbuffer output for easier tailing --- match-stripe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/match-stripe.py b/match-stripe.py index db2c741e6c..fe1613a258 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python2 -u from __future__ import absolute_import, division, print_function, unicode_literals import csv From 468a8f53309837b5d8abe94ad2926b743140289a Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 4 Mar 2016 07:40:52 -0500 Subject: [PATCH 16/63] Fix bug in skipping and use immutable ID as able --- match-stripe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index fe1613a258..7a37d56b84 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -81,7 +81,7 @@ def process_month(db, year, month): header("FUZZING") for rec in inexact: - guess = fuz2mat.get(rec['Description']) + 
guess = fuz2mat.get(rec['Customer ID']) fuzzed = fuzz(log, db, rec) possible = [m for m in fuzzed if not m.user_id in matched] @@ -93,18 +93,18 @@ def process_month(db, year, month): print('???', rec['Amount'], rec['Created']) # should log "skipping" below elif npossible == 1: match = possible[0] - if rec['Description'] in fuz2mat: + if rec['Customer ID'] in fuz2mat: print('(again) ', end='') else: - fuz2mat[rec['Description']] = match - else: + fuz2mat[rec['Customer ID']] = match + elif guess: match = {m.participant:m for m in possible}.get(guess.participant) - if not match: - print(' OR '.join([p.participant for p in possible])) if match: print(match.participant) rec2mat[rec['id']] = match + else: + print(' OR '.join([p.participant for p in possible])) header("WRITING") for rec in ordered: From 675df14447f30f6eded87ec4e0653680b7803718 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 4 Mar 2016 07:59:03 -0500 Subject: [PATCH 17/63] Tweak match criteria based on experience --- match-stripe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index 7a37d56b84..7ab476aa72 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -16,15 +16,16 @@ ON e.participant = p.username WHERE ( ((("timestamp" - %(Created)s) < '0 seconds') AND - (("timestamp" - %(Created)s) > '-61 seconds')) + (("timestamp" - %(Created)s) > '-62 seconds')) OR (("timestamp" - %(Created)s) = '0 seconds') OR ((("timestamp" - %(Created)s) > '0 seconds') AND - (("timestamp" - %(Created)s) < '61 seconds')) + (("timestamp" - %(Created)s) < '62 seconds')) ) AND amount + fee = %(Amount)s AND amount > 0 + AND recorder IS NULL -- filter out PayPal """ FIND = FUZZ + """ From 30c02beb2ef6033d23616793653a419abfcd4917 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 4 Mar 2016 08:00:01 -0500 Subject: [PATCH 18/63] Tighten up linking based on Customer ID --- match-stripe.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 
deletions(-) diff --git a/match-stripe.py b/match-stripe.py index 7ab476aa72..c27fe5f677 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -36,7 +36,7 @@ def find(log, db, rec): - log("finding", rec['Description']) + log("finding", rec['Description'], end=' => ') return db.one(FIND, rec) @@ -50,42 +50,51 @@ def process_month(db, year, month): writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w+')) headers = next(reader) - matched = [] rec2mat = {} inexact = [] ordered = [] - fuz2mat = {} + cid2mat = {} + uid2cid = {} header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) - log = lambda *a, **kw: print('{}-{}'.format(year, month), *a, **kw) header("FINDING") for row in reader: rec = dict(zip(headers, row)) rec[b'Created'] = rec.pop('Created (UTC)') # to make SQL interpolation easier + log = lambda *a, **kw: print(rec['Created'], *a, **kw) + if rec['id'] == 'ch_Pi3yBdmevsIr5q': continue # special-case the first test transaction - if rec['Status'] == 'Failed': - print("{} is a FAILURE!!!!!!!".format(rec['Description'])) + if rec['Status'] != 'Paid': + log("{Description} is {Status}!!!!!!!".format(**rec)) continue # right? 
ordered.append(rec) match = find(log, db, rec) if match: - matched.append(match.user_id) + uid = match.user_id + known = uid2cid.get(uid) + if known: + assert rec['Customer ID'] == known, (rec, match) + else: + cid2mat[rec['Customer ID']] = match + uid2cid[uid] = rec['Customer ID'] rec2mat[rec['id']] = match + print('yes') else: inexact.append(rec) + print('no') header("FUZZING") for rec in inexact: - guess = fuz2mat.get(rec['Customer ID']) + guess = cid2mat.get(rec['Customer ID']) fuzzed = fuzz(log, db, rec) - possible = [m for m in fuzzed if not m.user_id in matched] + possible = [m for m in fuzzed if not m.user_id in uid2cid] npossible = len(possible) print(' => ', end='') @@ -94,10 +103,10 @@ def process_month(db, year, month): print('???', rec['Amount'], rec['Created']) # should log "skipping" below elif npossible == 1: match = possible[0] - if rec['Customer ID'] in fuz2mat: + if rec['Customer ID'] in cid2mat: print('(again) ', end='') else: - fuz2mat[rec['Customer ID']] = match + cid2mat[rec['Customer ID']] = match elif guess: match = {m.participant:m for m in possible}.get(guess.participant) @@ -111,7 +120,7 @@ def process_month(db, year, month): for rec in ordered: match = rec2mat.get(rec['id']) if match is None: - log("skipping", rec['Description']) + log("skipping", rec['Description'], rec['id']) else: writer.writerow([ match.participant , match.user_id From 94f1ec61bd06a48868ccca6e043a1a29dad0a984 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 4 Mar 2016 08:08:30 -0500 Subject: [PATCH 19/63] Blip logging change --- match-stripe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/match-stripe.py b/match-stripe.py index c27fe5f677..363336b50d 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -100,7 +100,7 @@ def process_month(db, year, month): match = None if npossible == 0: - print('???', rec['Amount'], rec['Created']) # should log "skipping" below + print('???', rec['Amount'], end='') # should log "skipping" below elif 
npossible == 1: match = possible[0] if rec['Customer ID'] in cid2mat: From 213f65c8d63c29b90b10c23a63d4cb68976d3a20 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 9 Mar 2016 13:30:28 -0500 Subject: [PATCH 20/63] Broaden time horizon more Based on experience, makes sense due to minute granularity at Stripe. --- match-stripe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index 363336b50d..67a85ada79 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -15,13 +15,13 @@ JOIN participants p ON e.participant = p.username WHERE ( - ((("timestamp" - %(Created)s) < '0 seconds') AND - (("timestamp" - %(Created)s) > '-62 seconds')) + ((("timestamp" - %(Created)s) < '0 minutes') AND + (("timestamp" - %(Created)s) > '-2 minutes')) OR - (("timestamp" - %(Created)s) = '0 seconds') + (("timestamp" - %(Created)s) = '0 minutes') OR - ((("timestamp" - %(Created)s) > '0 seconds') AND - (("timestamp" - %(Created)s) < '62 seconds')) + ((("timestamp" - %(Created)s) > '0 minutes') AND + (("timestamp" - %(Created)s) < '2 minutes')) ) AND amount + fee = %(Amount)s AND amount > 0 @@ -81,8 +81,8 @@ def process_month(db, year, month): if known: assert rec['Customer ID'] == known, (rec, match) else: - cid2mat[rec['Customer ID']] = match uid2cid[uid] = rec['Customer ID'] + cid2mat[rec['Customer ID']] = match rec2mat[rec['id']] = match print('yes') else: @@ -120,7 +120,7 @@ def process_month(db, year, month): for rec in ordered: match = rec2mat.get(rec['id']) if match is None: - log("skipping", rec['Description'], rec['id']) + log("skipping", rec['Description'], rec['Customer ID'], rec['id']) else: writer.writerow([ match.participant , match.user_id From 1aed5e9df688fcdfb6f6585a79993759ee6d504b Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 9 Mar 2016 14:06:45 -0500 Subject: [PATCH 21/63] Filter out Balanced exchanges --- match-stripe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/match-stripe.py 
b/match-stripe.py index 67a85ada79..242180bf41 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -26,6 +26,7 @@ AND amount + fee = %(Amount)s AND amount > 0 AND recorder IS NULL -- filter out PayPal + AND route IS NULL -- filter out Balanced """ FIND = FUZZ + """ From 383a3bd5140a24ed883d03bfc10e84ca81d2760f Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 9 Mar 2016 14:44:46 -0500 Subject: [PATCH 22/63] Map Stripe status to our status --- match-stripe.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index 242180bf41..2c167248a2 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -69,9 +69,14 @@ def process_month(db, year, month): if rec['id'] == 'ch_Pi3yBdmevsIr5q': continue # special-case the first test transaction - if rec['Status'] != 'Paid': - log("{Description} is {Status}!!!!!!!".format(**rec)) - continue # right? + # translate status to our nomenclature + if rec['Status'] == 'Paid': + rec['Status'] = 'succeeded' + elif rec['Status'] == 'Failed': + rec['Status'] = 'failed' + continue # we'll deal with this next + else: + raise heck ordered.append(rec) From 27be8cf5cc4a2967af9a66392119ab35b23d7cf7 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 9 Mar 2016 15:39:50 -0500 Subject: [PATCH 23/63] Remember matches across months --- match-stripe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index 2c167248a2..d7c7358499 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -46,7 +46,7 @@ def fuzz(log, db, rec): return db.all(FUZZ, rec) -def process_month(db, year, month): +def process_month(db, cid2mat, uid2cid, year, month): reader = csv.reader(open('3912/{}/{}/_stripe-payments.csv'.format(year, month))) writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w+')) @@ -54,8 +54,6 @@ def process_month(db, year, month): rec2mat = {} inexact = [] ordered = [] - cid2mat = {} - uid2cid = {} header = lambda h: 
print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) @@ -138,12 +136,14 @@ def process_month(db, year, month): def main(db, constraint): + cid2mat = {} + uid2cid = {} for year in os.listdir('3912'): if not year.isdigit(): continue for month in os.listdir('3912/' + year): if not month.isdigit(): continue if constraint and not '{}-{}'.format(year, month) == constraint: continue - process_month(db, year, month) + process_month(db, cid2mat, uid2cid, year, month) if __name__ == '__main__': From 020580fad0bc9beae688df0e21615296a00a90cf Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 9 Mar 2016 15:51:50 -0500 Subject: [PATCH 24/63] Output failed exchanges as well as successful ones --- match-stripe.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/match-stripe.py b/match-stripe.py index d7c7358499..d32f16862e 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -8,7 +8,7 @@ from gratipay import wireup -FUZZ = """ +FUZZ = """\ SELECT e.*, p.id as user_id FROM exchanges e @@ -29,12 +29,21 @@ AND route IS NULL -- filter out Balanced """ -FIND = FUZZ + """ +FIND = FUZZ + """\ AND participant = %(Description)s """ +HAIL_MARY = """\ + + SELECT username AS participant + , id AS user_id + FROM participants + WHERE username=%(Description)s + +""" + def find(log, db, rec): log("finding", rec['Description'], end=' => ') @@ -46,6 +55,11 @@ def fuzz(log, db, rec): return db.all(FUZZ, rec) +def hail_mary(log, db, rec): + log("full of grace", rec['Description']) + return db.one(HAIL_MARY, rec) + + def process_month(db, cid2mat, uid2cid, year, month): reader = csv.reader(open('3912/{}/{}/_stripe-payments.csv'.format(year, month))) writer = csv.writer(open('3912/{}/{}/stripe'.format(year, month), 'w+')) @@ -67,6 +81,8 @@ def process_month(db, cid2mat, uid2cid, year, month): if rec['id'] == 'ch_Pi3yBdmevsIr5q': continue # special-case the first test transaction + ordered.append(rec) + # translate status to our nomenclature if 
rec['Status'] == 'Paid': rec['Status'] = 'succeeded' @@ -76,8 +92,6 @@ def process_month(db, cid2mat, uid2cid, year, month): else: raise heck - ordered.append(rec) - match = find(log, db, rec) if match: uid = match.user_id @@ -124,7 +138,17 @@ def process_month(db, cid2mat, uid2cid, year, month): for rec in ordered: match = rec2mat.get(rec['id']) if match is None: - log("skipping", rec['Description'], rec['Customer ID'], rec['id']) + assert rec['Status'] == 'failed' + match = cid2mat.get(rec['Customer ID']) # *any* successful exchanges for this user? + if not match: + match = hail_mary(log, db, rec) + writer.writerow([ match.participant + , match.user_id + , rec['Customer ID'] + , '' # signal to backfill.py to INSERT a new exchange record + , rec['id'] + , rec['Status'] + ]) else: writer.writerow([ match.participant , match.user_id From d468b549bd7d445f614f95077c65cab9e2c48ab6 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 9 Mar 2016 17:03:07 -0500 Subject: [PATCH 25/63] Enable inserting new exchanges (not just updating) --- backfill.py | 77 ++++++++++++++++++++++++++++++++++++++++--------- match-stripe.py | 6 +++- 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/backfill.py b/backfill.py index 6e81526947..ed4cea65a4 100755 --- a/backfill.py +++ b/backfill.py @@ -11,7 +11,8 @@ username ignored user_id required Gratipay participant.id address optional defaults to 'fake-deadbeef' - exchange_id required Gratipay exchanges.id + exchange_id optional Gratipay exchanges.id; required for status == 'succeeded' + amount optional transaction amount; required if exchange_id is empty ref optional defaults to 'fake-beeffeed' status required Gratipay exchanges.status: succeeded, failed, pending @@ -40,29 +41,77 @@ def fake(*a): return 'fake-' + sha.new(''.join(map(str, a))).hexdigest() -def link(db, log, network, user_id, address, exchange_id, ref, status): +def _load_stuff(db, user_id, network, address): participant = Participant.from_id(user_id) route = 
ExchangeRoute.from_network(participant, network) if route is None: route = ExchangeRoute.insert(participant, network, address) - db.run( "UPDATE exchanges SET status=%s, route=%s, ref=%s WHERE id=%s" - , (status, route.id, ref, exchange_id) - ) - log(network, participant.username, participant.id, address, exchange_id, ref, status) + return participant, route + + +def link(db, log, network, user_id, address, exchange_id, _, __, ref, status): + participant, route = _load_stuff(db, user_id, network, address) + SQL = "UPDATE exchanges SET status=%s, route=%s, ref=%s WHERE id=%s" + db.run(SQL, (status, route.id, ref, exchange_id)) + log(network, participant.username, participant.id, address, exchange_id, _, __, ref, status) + + +def make(db, log, network, user_id, address, _, timestamp, amount, ref, status): + participant, route = _load_stuff(db, user_id, network, address) + + SQL = """\ + + INSERT INTO exchanges + ("timestamp", amount, fee, participant, recorder, note, status, route, ref) + VALUES (%(timestamp)s, %(amount)s, %(fee)s, %(username)s, %(recorder)s, %(note)s, + %(status)s, %(route)s, %(ref)s) + RETURNING id + + """ + + params = dict( timestamp=timestamp + , amount=amount + , fee=0 + , username=participant.username + , recorder='Gratipay' + , note='https://github.com/gratipay/gratipay.com/pull/3912' + , status=status + , route=route.id + , ref=ref + ) + + exchange_id = db.one(SQL, params) + log(network, participant.username, participant.id, address, exchange_id, timestamp, amount, + ref, status) + + +def process_row(network, _, user_id, address, exchange_id, timestamp, amount, ref, status): + assert user_id + address = address or fake(network, user_id) + assert status + + if status == 'succeeded': + if network in ('cash', 'samurai'): + assert ref == '' + ref = None + else: + assert ref + func = link + elif status == 'failed': + assert ref + func = make + else: + raise heck + + func(db, log, network, user_id, address, exchange_id, timestamp, amount, ref, 
status) def main(db, log): for network in os.listdir('backfill'): if network.startswith('_'): continue data = csv.reader(open(path.join('backfill', network))) - for _, user_id, address, exchange_id, ref, status in data: - assert user_id - address = address or fake(network, user_id) - assert exchange_id - ref = ref or fake(network, user_id, exchange_id) - assert status - - link(db, log, network, user_id, address, exchange_id, ref, status) + for row in data: + process_row(network, *row) if __name__ == '__main__': diff --git a/match-stripe.py b/match-stripe.py index d32f16862e..5e8628bd30 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -145,7 +145,9 @@ def process_month(db, cid2mat, uid2cid, year, month): writer.writerow([ match.participant , match.user_id , rec['Customer ID'] - , '' # signal to backfill.py to INSERT a new exchange record + , '' + , rec['Created'] + , rec['Amount'] , rec['id'] , rec['Status'] ]) @@ -154,6 +156,8 @@ def process_month(db, cid2mat, uid2cid, year, month): , match.user_id , rec['Customer ID'] , match.id + , '' + , '' , rec['id'] , rec['Status'] ]) From 64495f4a50a0231e77b979fcf8381d079b5b7e61 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 18 Mar 2016 07:43:33 -0400 Subject: [PATCH 26/63] Start a match-balanced.py script Derives from match-stripe.py. 
--- match-balanced.py | 218 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100755 match-balanced.py diff --git a/match-balanced.py b/match-balanced.py new file mode 100755 index 0000000000..671e5f30c4 --- /dev/null +++ b/match-balanced.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python2 -u +from __future__ import absolute_import, division, print_function, unicode_literals + +import csv +import os +import sys +from os import path + +from gratipay import wireup + + +FUZZ = """\ + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE ( + ((("timestamp" - %(created_at)s) < '0 minutes') AND + (("timestamp" - %(created_at)s) > '-2 minutes')) + OR + (("timestamp" - %(created_at)s) = '0 minutes') + OR + ((("timestamp" - %(created_at)s) > '0 minutes') AND + (("timestamp" - %(created_at)s) < '2 minutes')) + ) + AND amount + fee = %(amount)s + AND amount > 0 + AND recorder IS NULL -- filter out PayPal + +""" +FIND = FUZZ + """\ + + AND participant = %(description)s + +""" + +HAIL_MARY = """\ + + SELECT username AS participant + , id AS user_id + FROM participants + WHERE username=%(description)s + +""" + + +def find(log, db, rec): + log("finding", rec['description'], end=' => ') + return db.one(FIND, rec) + + +def fuzz(log, db, rec): + log("fuzzing", rec['description'], end='') + return db.all(FUZZ, rec) + + +def hail_mary(log, db, rec): + log("full of grace", rec['description']) + return db.one(HAIL_MARY, rec) + + +def process_month(db, cid2mat, uid2cid, year, month): + if not path.isfile(path.join('3912', year, month, '_balanced.csv')): return + reader = csv.reader(open(path.join('3912', year, month, '_balanced.csv'))) + writer = csv.writer(open(path.join('3912', year, month, 'balanced.csv'), 'w+')) + + headers = next(reader) + rec2mat = {} + inexact = [] + ordered = [] + + header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) + + header("FINDING") + for row in 
reader: + rec = dict(zip(headers, row)) + rec = dict({unicode(k):v for k,v in dict(rec).items()}) + + # convert cents to dollars + rec['amount'] = '{}.{}'.format(rec['amount'][:-2], rec['amount'][-2:]) + if rec['amount'].startswith('.'): + rec['amount'] = '0' + rec['amount'] + + log = lambda *a, **kw: print(rec['created_at'], *a, **kw) + + ordered.append(rec) + + # translate status to our nomenclature + if rec['status'] in ('succeeded', 'failed'): + pass # we'll deal with this next + else: + raise heck + + if rec['kind'] == 'card_hold': + continue + if rec['kind'] == 'credit': + rec['amount'] = '-' + rec['amount'] + + match = find(log, db, rec) + if match and match.route is not None: + assert match.ref == rec['id'] + assert match.status is not None + assert match.route is not None + ordered.pop() + print('all set!') + elif match: + uid = match.user_id + known = uid2cid.get(uid) + if known: + assert rec['links_customer'] == known, (rec, match) + else: + uid2cid[uid] = rec['links_customer'] + cid2mat[rec['links_customer']] = match + rec2mat[rec['id']] = match + print('yes') + else: + inexact.append(rec) + print('no') + + header("FUZZING") + for rec in inexact: + guess = cid2mat.get(rec['links_customer']) + + fuzzed = fuzz(log, db, rec) + possible = [m for m in fuzzed if not m.user_id in uid2cid] + npossible = len(possible) + print(' => ', end='') + + match = None + if npossible == 0: + print('???', rec['amount'], end='') # should log "skipping" below + elif npossible == 1: + match = possible[0] + if rec['links_customer'] in cid2mat: + print('(again) ', end='') + else: + cid2mat[rec['links_customer']] = match + elif guess: + match = {m.participant:m for m in possible}.get(guess.participant) + + if match: + print(match.participant) + rec2mat[rec['id']] = match + else: + print(' OR '.join([p.participant for p in possible])) + + header("WRITING") + for rec in ordered: + match = rec2mat.get(rec['id']) + if match is None: + assert rec['status'] == 'failed' + match = 
cid2mat.get(rec['links_customer']) # *any* successful exchanges for this user? + if not match: + match = hail_mary(log, db, rec) + writer.writerow([ match.participant + , match.user_id + , rec['Customer ID'] + , '' + , rec['Created'] + , rec['Amount'] + , rec['id'] + , rec['Status'] + ]) + else: + writer.writerow([ match.participant + , match.user_id + , rec['Customer ID'] + , match.id + , '' + , '' + , rec['id'] + , rec['Status'] + ]) + + +def main(db, constraint): + cid2mat = {} + uid2cid = {} + for year in os.listdir('3912'): + if not year.isdigit(): continue + for month in os.listdir('3912/' + year): + if not month.isdigit(): continue + if constraint and not '{}-{}'.format(year, month) == constraint: continue + process_month(db, cid2mat, uid2cid, year, month) + + +if __name__ == '__main__': + db = wireup.db(wireup.env()) + constraint = '' if len(sys.argv) < 2 else sys.argv[1] + main(db, constraint) + + +""" +Fields in balanced.dat: + + id + kind + meta_state + meta_participant_id + transaction_number + status + created_at + updated_at + failure_reason_code + currency + voided_at + href + amount + description + expires_at + failure_reason + meta_exchange_id + appears_on_statement_as + meta_balanced.result.trace_number + meta_balanced.result.return_reason_code + +""" From 4adfe9113f838ee3d5914222aa2ad59831f7ae2b Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 21 Mar 2016 18:26:33 -0400 Subject: [PATCH 27/63] Resolve ambiguity w/ Balanced w/ closest timestamp --- match-balanced.py | 89 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 29 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 671e5f30c4..af50a5531e 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import csv +import datetime import os import sys from os import path @@ -61,9 +62,11 @@ def hail_mary(log, db, rec): def process_month(db, cid2mat, 
uid2cid, year, month): - if not path.isfile(path.join('3912', year, month, '_balanced.csv')): return - reader = csv.reader(open(path.join('3912', year, month, '_balanced.csv'))) - writer = csv.writer(open(path.join('3912', year, month, 'balanced.csv'), 'w+')) + input_csv = path.join('3912', year, month, '_balanced.csv') + match_csv = path.join('3912', year, month, 'balanced') + if not path.isfile(input_csv): return + reader = csv.reader(open(input_csv)) + writer = csv.writer(open(match_csv, 'w+')) headers = next(reader) rec2mat = {} @@ -77,6 +80,9 @@ def process_month(db, cid2mat, uid2cid, year, month): rec = dict(zip(headers, row)) rec = dict({unicode(k):v for k,v in dict(rec).items()}) + if rec['id'] in ('WD7qFYL9rqIrCUmbXsgJJ8HT', 'WD16Zqy9ISWN5muEhXo19vpn'): + continue # special-case the first test transactions + # convert cents to dollars rec['amount'] = '{}.{}'.format(rec['amount'][:-2], rec['amount'][-2:]) if rec['amount'].startswith('.'): @@ -84,8 +90,6 @@ def process_month(db, cid2mat, uid2cid, year, month): log = lambda *a, **kw: print(rec['created_at'], *a, **kw) - ordered.append(rec) - # translate status to our nomenclature if rec['status'] in ('succeeded', 'failed'): pass # we'll deal with this next @@ -94,33 +98,42 @@ def process_month(db, cid2mat, uid2cid, year, month): if rec['kind'] == 'card_hold': continue + if rec['kind'] == 'credit': rec['amount'] = '-' + rec['amount'] + elif rec['kind'] == 'debit': + pass + else: + raise heck + + cid = rec['links__customer'] + ordered.append(rec) match = find(log, db, rec) - if match and match.route is not None: - assert match.ref == rec['id'] - assert match.status is not None - assert match.route is not None - ordered.pop() - print('all set!') - elif match: + if match: uid = match.user_id known = uid2cid.get(uid) if known: - assert rec['links_customer'] == known, (rec, match) + assert cid == known, (rec, match) else: - uid2cid[uid] = rec['links_customer'] - cid2mat[rec['links_customer']] = match + uid2cid[uid] 
= cid + cid2mat[cid] = match rec2mat[rec['id']] = match - print('yes') + + if match.route is not None: + assert match.ref == rec['id'] + assert match.status is not None + ordered.pop() + print('all set!') + else: + print('yes') else: inexact.append(rec) print('no') header("FUZZING") for rec in inexact: - guess = cid2mat.get(rec['links_customer']) + guess = cid2mat.get(cid) fuzzed = fuzz(log, db, rec) possible = [m for m in fuzzed if not m.user_id in uid2cid] @@ -132,45 +145,63 @@ def process_month(db, cid2mat, uid2cid, year, month): print('???', rec['amount'], end='') # should log "skipping" below elif npossible == 1: match = possible[0] - if rec['links_customer'] in cid2mat: + if cid in cid2mat: print('(again) ', end='') else: - cid2mat[rec['links_customer']] = match + cid2mat[cid] = match elif guess: match = {m.participant:m for m in possible}.get(guess.participant) if match: print(match.participant) - rec2mat[rec['id']] = match else: - print(' OR '.join([p.participant for p in possible])) + mindelta = None + + date, time = rec['created_at'].split('T') + Y,M,D = date.split('-') + h,m,s = time.split(':') + s,ms = s.split('.') + ms = ms[:-1] + Y,M,D,h,m,s,ms = [int(x) for x in (Y,M,D,h,m,s,ms)] + ts_balanced = datetime.datetime(Y,M,D,h,m,s,ms, possible[0].timestamp.tzinfo) + + for p in possible: + delta = abs(ts_balanced - p.timestamp) + if mindelta is None or (delta < mindelta): + mindelta = delta + match = p + + possible.remove(match) + print(match.participant, 'INSTEAD OF', ' OR '.join([p.participant for p in possible])) + + rec2mat[rec['id']] = match header("WRITING") for rec in ordered: match = rec2mat.get(rec['id']) if match is None: - assert rec['status'] == 'failed' - match = cid2mat.get(rec['links_customer']) # *any* successful exchanges for this user? + assert rec['status'] == 'failed', rec['id'] + match = cid2mat.get(cid) # *any* successful exchanges for this user? 
if not match: match = hail_mary(log, db, rec) writer.writerow([ match.participant , match.user_id - , rec['Customer ID'] + , rec['links__customer'] , '' - , rec['Created'] - , rec['Amount'] + , rec['created_at'] + , rec['amount'] , rec['id'] - , rec['Status'] + , rec['status'] ]) else: writer.writerow([ match.participant , match.user_id - , rec['Customer ID'] + , rec['links__customer'] , match.id , '' , '' , rec['id'] - , rec['Status'] + , rec['status'] ]) From 2fafc3d70b04f44f7d00a790546a8bd9acb9c362 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 24 Mar 2016 14:00:54 -0400 Subject: [PATCH 28/63] Broaden time range to pick up anomalies --- match-balanced.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index af50a5531e..6e11835bb9 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -18,15 +18,18 @@ ON e.participant = p.username WHERE ( ((("timestamp" - %(created_at)s) < '0 minutes') AND - (("timestamp" - %(created_at)s) > '-2 minutes')) + (("timestamp" - %(created_at)s) > '-7 minutes')) OR (("timestamp" - %(created_at)s) = '0 minutes') OR ((("timestamp" - %(created_at)s) > '0 minutes') AND - (("timestamp" - %(created_at)s) < '2 minutes')) + (("timestamp" - %(created_at)s) < '7 minutes')) + ) + AND ( + ((amount > 0) AND (amount + fee = %(amount)s)) + OR + ((amount < 0) AND (amount = %(amount)s)) ) - AND amount + fee = %(amount)s - AND amount > 0 AND recorder IS NULL -- filter out PayPal """ @@ -78,33 +81,32 @@ def process_month(db, cid2mat, uid2cid, year, month): header("FINDING") for row in reader: rec = dict(zip(headers, row)) - rec = dict({unicode(k):v for k,v in dict(rec).items()}) + #rec = dict({unicode(k):v for k,v in dict(rec).items()}) + # special-case the first test transactions if rec['id'] in ('WD7qFYL9rqIrCUmbXsgJJ8HT', 'WD16Zqy9ISWN5muEhXo19vpn'): - continue # special-case the first test transactions + continue # convert cents to 
dollars rec['amount'] = '{}.{}'.format(rec['amount'][:-2], rec['amount'][-2:]) if rec['amount'].startswith('.'): rec['amount'] = '0' + rec['amount'] - log = lambda *a, **kw: print(rec['created_at'], *a, **kw) - - # translate status to our nomenclature - if rec['status'] in ('succeeded', 'failed'): - pass # we'll deal with this next - else: - raise heck + # check status + if not rec['status'] in ('succeeded', 'failed'): + raise Exception(rec) + # check kind if rec['kind'] == 'card_hold': - continue - - if rec['kind'] == 'credit': + continue # we never tracked these in the Gratipay db + elif rec['kind'] in ('credit', 'refund'): rec['amount'] = '-' + rec['amount'] - elif rec['kind'] == 'debit': + elif rec['kind'] in ('debit', 'reversal'): pass else: - raise heck + raise Exception(rec) + + log = lambda *a, **kw: print(rec['created_at'], *a, **kw) cid = rec['links__customer'] ordered.append(rec) @@ -133,10 +135,12 @@ def process_month(db, cid2mat, uid2cid, year, month): header("FUZZING") for rec in inexact: + cid = rec['links__customer'] guess = cid2mat.get(cid) fuzzed = fuzz(log, db, rec) - possible = [m for m in fuzzed if not m.user_id in uid2cid] + keep = lambda m: (not m.user_id in uid2cid) or (guess and m.user_id == guess.user_id) + possible = [m for m in fuzzed if keep(m)] npossible = len(possible) print(' => ', end='') @@ -150,10 +154,14 @@ def process_month(db, cid2mat, uid2cid, year, month): else: cid2mat[cid] = match elif guess: + print('(guessing) ', end='') match = {m.participant:m for m in possible}.get(guess.participant) if match: print(match.participant) + elif not possible: + print(' ... IMPOSSIBLE!!!!!!!!!!!') + continue else: mindelta = None From 94b322c7f7e420ac3886ae5ab18dc5d7e87ba349 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 25 Mar 2016 07:58:33 -0400 Subject: [PATCH 29/63] Simplify time range check This may shoot us in the foot. 
--- match-balanced.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 6e11835bb9..1c6cfd1fa5 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -17,13 +17,9 @@ JOIN participants p ON e.participant = p.username WHERE ( - ((("timestamp" - %(created_at)s) < '0 minutes') AND - (("timestamp" - %(created_at)s) > '-7 minutes')) - OR - (("timestamp" - %(created_at)s) = '0 minutes') - OR - ((("timestamp" - %(created_at)s) > '0 minutes') AND - (("timestamp" - %(created_at)s) < '7 minutes')) + (("timestamp" - %(created_at)s) >= '0 minutes') + AND + (("timestamp" - %(created_at)s) < '7 minutes') ) AND ( ((amount > 0) AND (amount + fee = %(amount)s)) From 98e661b125a49e2baef5d6f293d3d6e707038515 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 25 Mar 2016 07:58:58 -0400 Subject: [PATCH 30/63] Keep going even if we can't find a match --- match-balanced.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/match-balanced.py b/match-balanced.py index 1c6cfd1fa5..55e5adc8b8 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -71,6 +71,7 @@ def process_month(db, cid2mat, uid2cid, year, month): rec2mat = {} inexact = [] ordered = [] + failed = set() header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) @@ -157,6 +158,7 @@ def process_month(db, cid2mat, uid2cid, year, month): print(match.participant) elif not possible: print(' ... 
IMPOSSIBLE!!!!!!!!!!!') + failed.add(rec['id']) continue else: mindelta = None @@ -182,6 +184,7 @@ def process_month(db, cid2mat, uid2cid, year, month): header("WRITING") for rec in ordered: + if rec['id'] in failed: continue match = rec2mat.get(rec['id']) if match is None: assert rec['status'] == 'failed', rec['id'] From 95162c2eaaf4b50b486a45e469addd9555428d9b Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 25 Mar 2016 08:08:12 -0400 Subject: [PATCH 31/63] Satisfy pyflakes (fix Travis) --- backfill.py | 2 +- match-stripe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backfill.py b/backfill.py index ed4cea65a4..eb8012be69 100755 --- a/backfill.py +++ b/backfill.py @@ -101,7 +101,7 @@ def process_row(network, _, user_id, address, exchange_id, timestamp, amount, re assert ref func = make else: - raise heck + assert 0, locals() func(db, log, network, user_id, address, exchange_id, timestamp, amount, ref, status) diff --git a/match-stripe.py b/match-stripe.py index 5e8628bd30..f2ed25b684 100755 --- a/match-stripe.py +++ b/match-stripe.py @@ -90,7 +90,7 @@ def process_month(db, cid2mat, uid2cid, year, month): rec['Status'] = 'failed' continue # we'll deal with this next else: - raise heck + assert 0, locals() match = find(log, db, rec) if match: From 72ce8e69d6e44b6b0119cb9c35321b297faefd83 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 28 Mar 2016 08:45:00 -0400 Subject: [PATCH 32/63] Better handle mismatch cases --- match-balanced.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 55e5adc8b8..64e0d9d289 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -120,10 +120,22 @@ def process_month(db, cid2mat, uid2cid, year, month): rec2mat[rec['id']] = match if match.route is not None: - assert match.ref == rec['id'] - assert match.status is not None - ordered.pop() - print('all set!') + if match.ref is None and match.status is None: + 
print('missing ref and status!') + elif match.ref != rec['id'] and match.status != rec['status']: + print('mismatched ref and status!') + elif match.ref is None: + print('missing ref!') + elif match.ref != rec['id']: + print('mismatched ref!') + elif match.status is None: + print('missing status!') + elif match.status != rec['status']: + print('mismatched status!') + + else: + ordered.pop() + print('all set!') else: print('yes') else: From 76f11497e5be66bf178949766c4819415504b188 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 30 Mar 2016 15:03:53 -0400 Subject: [PATCH 33/63] Bring over the tally script from #3807 --- tally-backfill.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100755 tally-backfill.py diff --git a/tally-backfill.py b/tally-backfill.py new file mode 100755 index 0000000000..deb25f04d7 --- /dev/null +++ b/tally-backfill.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +from __future__ import absolute_import, division, print_function, unicode_literals + +import commands + + +def title(title): + print(title) + print("-"*34) + +def report(*patterns): + N = 0 + for pattern in patterns: + n = int(commands.getoutput('grep "{}$" backfill.log | wc -l'.format(pattern))) + N += n + print("{:<28} {:>5}".format(pattern, n)) + print("{:<28} {:>5}".format('', N)) + + +report( 'IMPOSSIBLE!!!!!!!!!!!' + , 'all set!' + , 'yes' + , 'no' + , 'missing ref and status!' + , 'mismatched ref and status!' + , 'missing ref!' + , 'mismatched ref!' + , 'missing status!' + , 'mismatched status!' + ) From 3d7d7b1234881970b39bc8284eb84c8eea21d85f Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 31 Mar 2016 07:52:42 -0400 Subject: [PATCH 34/63] Wrap db access into a class I want to change the db call pattern as an optimization. This sets us up for that. I checked the output on 2013-07 and it's the same. 
--- match-balanced.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 64e0d9d289..0faf0e067a 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -45,22 +45,25 @@ """ -def find(log, db, rec): - log("finding", rec['description'], end=' => ') - return db.one(FIND, rec) +class Matcher(object): + def __init__(self, db): + self.db = db -def fuzz(log, db, rec): - log("fuzzing", rec['description'], end='') - return db.all(FUZZ, rec) + def find(self, log, rec): + log("finding", rec['description'], end=' => ') + return self.db.one(FIND, rec) + def fuzz(self, log, rec): + log("fuzzing", rec['description'], end='') + return self.db.all(FUZZ, rec) -def hail_mary(log, db, rec): - log("full of grace", rec['description']) - return db.one(HAIL_MARY, rec) + def hail_mary(self, log, rec): + log("full of grace", rec['description']) + return self.db.one(HAIL_MARY, rec) -def process_month(db, cid2mat, uid2cid, year, month): +def process_month(matcher, cid2mat, uid2cid, year, month): input_csv = path.join('3912', year, month, '_balanced.csv') match_csv = path.join('3912', year, month, 'balanced') if not path.isfile(input_csv): return @@ -108,7 +111,7 @@ def process_month(db, cid2mat, uid2cid, year, month): cid = rec['links__customer'] ordered.append(rec) - match = find(log, db, rec) + match = matcher.find(log, rec) if match: uid = match.user_id known = uid2cid.get(uid) @@ -147,7 +150,7 @@ def process_month(db, cid2mat, uid2cid, year, month): cid = rec['links__customer'] guess = cid2mat.get(cid) - fuzzed = fuzz(log, db, rec) + fuzzed = matcher.fuzz(log, rec) keep = lambda m: (not m.user_id in uid2cid) or (guess and m.user_id == guess.user_id) possible = [m for m in fuzzed if keep(m)] npossible = len(possible) @@ -202,7 +205,7 @@ def process_month(db, cid2mat, uid2cid, year, month): assert rec['status'] == 'failed', rec['id'] match = cid2mat.get(cid) # *any* successful exchanges 
for this user? if not match: - match = hail_mary(log, db, rec) + match = matcher.hail_mary(log, rec) writer.writerow([ match.participant , match.user_id , rec['links__customer'] @@ -224,7 +227,7 @@ def process_month(db, cid2mat, uid2cid, year, month): ]) -def main(db, constraint): +def main(matcher, constraint): cid2mat = {} uid2cid = {} for year in os.listdir('3912'): @@ -232,13 +235,14 @@ def main(db, constraint): for month in os.listdir('3912/' + year): if not month.isdigit(): continue if constraint and not '{}-{}'.format(year, month) == constraint: continue - process_month(db, cid2mat, uid2cid, year, month) + process_month(matcher, cid2mat, uid2cid, year, month) if __name__ == '__main__': - db = wireup.db(wireup.env()) - constraint = '' if len(sys.argv) < 2 else sys.argv[1] - main(db, constraint) + _db = wireup.db(wireup.env()) + _matcher = Matcher(_db) + _constraint = '' if len(sys.argv) < 2 else sys.argv[1] + main(_matcher, _constraint) """ From 89223912fbce54f3afa686c8fdfdaf0b0f958c01 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 31 Mar 2016 08:12:55 -0400 Subject: [PATCH 35/63] Refactor hail mary for a single db call --- match-balanced.py | 14 +++++++++++--- tally-backfill.py | 23 ++++++++++++----------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 0faf0e067a..6547c99e60 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -35,12 +35,11 @@ """ -HAIL_MARY = """\ +STUBS = """\ SELECT username AS participant , id AS user_id FROM participants - WHERE username=%(description)s """ @@ -50,6 +49,12 @@ class Matcher(object): def __init__(self, db): self.db = db + def load_month(self, year, month): + pass + + def load_stubs(self): + self.username2stub = self.db.all(STUBS) + def find(self, log, rec): log("finding", rec['description'], end=' => ') return self.db.one(FIND, rec) @@ -59,8 +64,9 @@ def fuzz(self, log, rec): return self.db.all(FUZZ, rec) def hail_mary(self, log, rec): + # XXX I'm 
not sure this is ever hit! log("full of grace", rec['description']) - return self.db.one(HAIL_MARY, rec) + return self.username2stub.get(rec['description']) def process_month(matcher, cid2mat, uid2cid, year, month): @@ -70,6 +76,8 @@ def process_month(matcher, cid2mat, uid2cid, year, month): reader = csv.reader(open(input_csv)) writer = csv.writer(open(match_csv, 'w+')) + matcher.load_month(year, month) + headers = next(reader) rec2mat = {} inexact = [] diff --git a/tally-backfill.py b/tally-backfill.py index deb25f04d7..1260e75d7f 100755 --- a/tally-backfill.py +++ b/tally-backfill.py @@ -11,20 +11,21 @@ def title(title): def report(*patterns): N = 0 for pattern in patterns: - n = int(commands.getoutput('grep "{}$" backfill.log | wc -l'.format(pattern))) + n = int(commands.getoutput('grep "{}" backfill.log | wc -l'.format(pattern))) N += n print("{:<28} {:>5}".format(pattern, n)) print("{:<28} {:>5}".format('', N)) -report( 'IMPOSSIBLE!!!!!!!!!!!' - , 'all set!' - , 'yes' - , 'no' - , 'missing ref and status!' - , 'mismatched ref and status!' - , 'missing ref!' - , 'mismatched ref!' - , 'missing status!' - , 'mismatched status!' +report( 'IMPOSSIBLE!!!!!!!!!!!$' + , 'all set!$' + , 'yes$' + , 'no$' + , 'missing ref and status!$' + , 'mismatched ref and status!$' + , 'missing ref!$' + , 'mismatched ref!$' + , 'missing status!$' + , 'mismatched status!$' + , 'full of grace' ) From ae3e61fdf6ce961f198fa55bf6015a27c318c939 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 31 Mar 2016 21:10:37 -0400 Subject: [PATCH 36/63] Start optimizing find; there's a bug! 
--- match-balanced.py | 52 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 6547c99e60..8a8699a325 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -5,8 +5,10 @@ import datetime import os import sys +from decimal import Decimal as D from os import path +import psycopg2.tz from gratipay import wireup @@ -35,6 +37,18 @@ """ +FULL = """\ + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE substr("timestamp"::text, 0, 8) = %s + AND recorder IS NULL -- filter out PayPal + ORDER BY "timestamp" asc + +""" + STUBS = """\ SELECT username AS participant @@ -44,20 +58,53 @@ """ +def datetime_from_iso(iso): + date, time = iso.split('T') + assert time[-1] == 'Z' + time = time[:-1] + year, month, day = map(int, date.split('-')) + hour, minute, second_microsecond = time.split(':') + hour, minute = map(int, (hour, minute)) + second, microsecond = map(int, second_microsecond.split('.')) + tz = psycopg2.tz.FixedOffsetTimezone(offset=0, name=None) + return datetime.datetime(year, month, day, hour, minute, second, microsecond, tzinfo=tz) + + class Matcher(object): def __init__(self, db): self.db = db def load_month(self, year, month): - pass + self.exchanges = self.db.all(FULL, ('{}-{}'.format(year, month),)) def load_stubs(self): self.username2stub = self.db.all(STUBS) def find(self, log, rec): log("finding", rec['description'], end=' => ') - return self.db.one(FIND, rec) + for i in range(len(self.exchanges)): + e = self.exchanges[i] + + # check username + if e.participant != rec['description']: + continue + + # check amount + amount = D(rec['amount']) + if (e.amount > 0) and (e.amount + e.fee != amount): + continue + if (e.amount < 0) and (e.amount != amount): + continue + + # check timestamp + delta = e.timestamp - datetime_from_iso(rec['created_at']) + threshold = datetime.timedelta(minutes=2) + if delta < 
threshold: + self.exchanges.pop(i) + return e # got one! + + return None def fuzz(self, log, rec): log("fuzzing", rec['description'], end='') @@ -139,6 +186,7 @@ def process_month(matcher, cid2mat, uid2cid, year, month): print('missing ref!') elif match.ref != rec['id']: print('mismatched ref!') + import pdb; pdb.set_trace() elif match.status is None: print('missing status!') elif match.status != rec['status']: From 0aeda86fb2d3be3ba9bd8e2ad890168ae2002acf Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 4 Apr 2016 07:35:51 -0400 Subject: [PATCH 37/63] Squish a bug --- match-balanced.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 8a8699a325..d58b766838 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -98,7 +98,13 @@ def find(self, log, rec): continue # check timestamp - delta = e.timestamp - datetime_from_iso(rec['created_at']) + timestamp = datetime_from_iso(rec['created_at']) + if e.timestamp < timestamp: + # the Balanced record always precedes the db record + continue + + # keep checking timestamp + delta = e.timestamp - timestamp threshold = datetime.timedelta(minutes=2) if delta < threshold: self.exchanges.pop(i) @@ -186,12 +192,10 @@ def process_month(matcher, cid2mat, uid2cid, year, month): print('missing ref!') elif match.ref != rec['id']: print('mismatched ref!') - import pdb; pdb.set_trace() elif match.status is None: print('missing status!') elif match.status != rec['status']: print('mismatched status!') - else: ordered.pop() print('all set!') From 5c5853d2a89a0abe53caa8ade7e6c5c329f1611d Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 4 Apr 2016 07:36:55 -0400 Subject: [PATCH 38/63] Deprecate hail_mary --- match-balanced.py | 1 + 1 file changed, 1 insertion(+) diff --git a/match-balanced.py b/match-balanced.py index d58b766838..17633c2d06 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -118,6 +118,7 @@ def fuzz(self, log, rec): def 
hail_mary(self, log, rec): # XXX I'm not sure this is ever hit! + raise NotImplementedError # let's find out log("full of grace", rec['description']) return self.username2stub.get(rec['description']) From 65f58e28fdce9e49d517433308f77681122f189d Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 4 Apr 2016 15:01:32 -0400 Subject: [PATCH 39/63] Start implementing fuzz using in-mem struct --- match-balanced.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 17633c2d06..cf9fffab20 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -83,12 +83,24 @@ def load_stubs(self): def find(self, log, rec): log("finding", rec['description'], end=' => ') - for i in range(len(self.exchanges)): + found = self._find(log, rec, relaxed=False) + return found[0] if found else None + + def fuzz(self, log, rec): + log("fuzzing", rec['description'], end='') + return self._find(log, rec, relaxed=True) + + def _find(self, log, rec, relaxed): + found = [] + i = 0 + while i < len(self.exchanges): e = self.exchanges[i] + i += 1 # check username - if e.participant != rec['description']: - continue + if not relaxed: + if e.participant != rec['description']: + continue # check amount amount = D(rec['amount']) @@ -105,16 +117,19 @@ def find(self, log, rec): # keep checking timestamp delta = e.timestamp - timestamp - threshold = datetime.timedelta(minutes=2) - if delta < threshold: + threshold = datetime.timedelta(minutes=7) + if delta > threshold: + break + + if not found: self.exchanges.pop(i) - return e # got one! + i -= 1 + found.append(e) - return None + if not relaxed: + break - def fuzz(self, log, rec): - log("fuzzing", rec['description'], end='') - return self.db.all(FUZZ, rec) + return found def hail_mary(self, log, rec): # XXX I'm not sure this is ever hit! 
From 92920dc4cda53f1a5720a21161cf67b482c2e52f Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Tue, 5 Apr 2016 17:51:59 -0400 Subject: [PATCH 40/63] Fix a bug and try to fix another --- match-balanced.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index cf9fffab20..d34cd0c2cc 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -88,7 +88,9 @@ def find(self, log, rec): def fuzz(self, log, rec): log("fuzzing", rec['description'], end='') - return self._find(log, rec, relaxed=True) + fuzzed = self._find(log, rec, relaxed=True) + fuzzed.sort(key=lambda x: x.id) + return fuzzed def _find(self, log, rec, relaxed): found = [] @@ -122,8 +124,8 @@ def _find(self, log, rec, relaxed): break if not found: - self.exchanges.pop(i) i -= 1 + self.exchanges.pop(i) found.append(e) if not relaxed: From 4100f4e74997d70e119adc45c196253eed41e963 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 11 Apr 2016 18:10:08 -0400 Subject: [PATCH 41/63] Add an arg to tally-backfill.py: filename --- tally-backfill.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tally-backfill.py b/tally-backfill.py index 1260e75d7f..e224998d81 100755 --- a/tally-backfill.py +++ b/tally-backfill.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import absolute_import, division, print_function, unicode_literals +import sys import commands @@ -8,16 +9,17 @@ def title(title): print(title) print("-"*34) -def report(*patterns): +def report(filename, *patterns): N = 0 for pattern in patterns: - n = int(commands.getoutput('grep "{}" backfill.log | wc -l'.format(pattern))) + n = int(commands.getoutput('grep "{}" {} | wc -l'.format(pattern, filename))) N += n print("{:<28} {:>5}".format(pattern, n)) print("{:<28} {:>5}".format('', N)) -report( 'IMPOSSIBLE!!!!!!!!!!!$' +report( sys.argv[1] if len(sys.argv) > 1 else 'backfill.log' + , 'IMPOSSIBLE!!!!!!!!!!!$' , 'all set!$' , 'yes$' , 'no$' From 
efaf0d3b42cd5888a2528e4c752e8ecbdee1b915 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 15 Apr 2016 08:02:00 -0400 Subject: [PATCH 42/63] Enable running *through* a month --- match-balanced.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/match-balanced.py b/match-balanced.py index d34cd0c2cc..13e509fd73 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -3,6 +3,7 @@ import csv import datetime +import operator import os import sys from decimal import Decimal as D @@ -308,11 +309,17 @@ def process_month(matcher, cid2mat, uid2cid, year, month): def main(matcher, constraint): cid2mat = {} uid2cid = {} + + op = operator.eq + if constraint[0] == '_': + constraint = constraint[1:] + op = operator.le + for year in os.listdir('3912'): if not year.isdigit(): continue for month in os.listdir('3912/' + year): if not month.isdigit(): continue - if constraint and not '{}-{}'.format(year, month) == constraint: continue + if constraint and not op('{}-{}'.format(year, month), constraint): continue process_month(matcher, cid2mat, uid2cid, year, month) From e6d11f9e7ca6242ae612edc34112b4eaccb38700 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 15 Apr 2016 08:02:17 -0400 Subject: [PATCH 43/63] Fix a bug ... uncover another? --- match-balanced.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/match-balanced.py b/match-balanced.py index 13e509fd73..eaf003fa15 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -247,6 +247,7 @@ def process_month(matcher, cid2mat, uid2cid, year, month): elif guess: print('(guessing) ', end='') match = {m.participant:m for m in possible}.get(guess.participant) + # XXX Why would we ever have a matched guess *and* "INSTEAD OF"? 
if match: print(match.participant) @@ -271,6 +272,7 @@ def process_month(matcher, cid2mat, uid2cid, year, month): mindelta = delta match = p + cid2mat[cid] = match possible.remove(match) print(match.participant, 'INSTEAD OF', ' OR '.join([p.participant for p in possible])) From 2e98494dcb57d9ff49dd1a4658cface32694d476 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 15 Apr 2016 15:28:34 -0400 Subject: [PATCH 44/63] Because matches are exchanges, not participants --- match-balanced.py | 1 - 1 file changed, 1 deletion(-) diff --git a/match-balanced.py b/match-balanced.py index eaf003fa15..7201044245 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -247,7 +247,6 @@ def process_month(matcher, cid2mat, uid2cid, year, month): elif guess: print('(guessing) ', end='') match = {m.participant:m for m in possible}.get(guess.participant) - # XXX Why would we ever have a matched guess *and* "INSTEAD OF"? if match: print(match.participant) From b20864645289c63ddaf5d2a2c3a44572f713e6bd Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 15 Apr 2016 15:29:50 -0400 Subject: [PATCH 45/63] Prune dead code --- match-balanced.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 7201044245..ff376b4910 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -13,31 +13,6 @@ from gratipay import wireup -FUZZ = """\ - - SELECT e.*, p.id as user_id - FROM exchanges e - JOIN participants p - ON e.participant = p.username - WHERE ( - (("timestamp" - %(created_at)s) >= '0 minutes') - AND - (("timestamp" - %(created_at)s) < '7 minutes') - ) - AND ( - ((amount > 0) AND (amount + fee = %(amount)s)) - OR - ((amount < 0) AND (amount = %(amount)s)) - ) - AND recorder IS NULL -- filter out PayPal - -""" -FIND = FUZZ + """\ - - AND participant = %(description)s - -""" - FULL = """\ SELECT e.*, p.id as user_id @@ -161,7 +136,6 @@ def process_month(matcher, cid2mat, uid2cid, year, month): header("FINDING") for 
row in reader: rec = dict(zip(headers, row)) - #rec = dict({unicode(k):v for k,v in dict(rec).items()}) # special-case the first test transactions if rec['id'] in ('WD7qFYL9rqIrCUmbXsgJJ8HT', 'WD16Zqy9ISWN5muEhXo19vpn'): From 906bfb86786bd541c2933dfcdc0c8a5a4190780b Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 15 Apr 2016 15:45:12 -0400 Subject: [PATCH 46/63] Fix bug in CLI constraint arg --- match-balanced.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/match-balanced.py b/match-balanced.py index ff376b4910..a945b9b4de 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -286,7 +286,7 @@ def main(matcher, constraint): uid2cid = {} op = operator.eq - if constraint[0] == '_': + if constraint and constraint[0] == '_': constraint = constraint[1:] op = operator.le From 0f319b610ce85e170591217d5b07064f530dc8c9 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Fri, 15 Apr 2016 15:50:49 -0400 Subject: [PATCH 47/63] Start moving structs onto Matcher --- match-balanced.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index a945b9b4de..7b39d3709e 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -50,6 +50,8 @@ class Matcher(object): def __init__(self, db): self.db = db + self.cid2mat = {} + self.uid2cid = {} def load_month(self, year, month): self.exchanges = self.db.all(FULL, ('{}-{}'.format(year, month),)) @@ -116,7 +118,7 @@ def hail_mary(self, log, rec): return self.username2stub.get(rec['description']) -def process_month(matcher, cid2mat, uid2cid, year, month): +def process_month(matcher, year, month): input_csv = path.join('3912', year, month, '_balanced.csv') match_csv = path.join('3912', year, month, 'balanced') if not path.isfile(input_csv): return @@ -168,12 +170,12 @@ def process_month(matcher, cid2mat, uid2cid, year, month): match = matcher.find(log, rec) if match: uid = match.user_id - known = uid2cid.get(uid) + known = 
matcher.uid2cid.get(uid) if known: assert cid == known, (rec, match) else: - uid2cid[uid] = cid - cid2mat[cid] = match + matcher.uid2cid[uid] = cid + matcher.cid2mat[cid] = match rec2mat[rec['id']] = match if match.route is not None: @@ -201,10 +203,10 @@ def process_month(matcher, cid2mat, uid2cid, year, month): header("FUZZING") for rec in inexact: cid = rec['links__customer'] - guess = cid2mat.get(cid) + guess = matcher.cid2mat.get(cid) fuzzed = matcher.fuzz(log, rec) - keep = lambda m: (not m.user_id in uid2cid) or (guess and m.user_id == guess.user_id) + keep = lambda m: (not m.user_id in matcher.uid2cid) or (guess and m.user_id == guess.user_id) possible = [m for m in fuzzed if keep(m)] npossible = len(possible) print(' => ', end='') @@ -214,10 +216,10 @@ def process_month(matcher, cid2mat, uid2cid, year, month): print('???', rec['amount'], end='') # should log "skipping" below elif npossible == 1: match = possible[0] - if cid in cid2mat: + if cid in matcher.cid2mat: print('(again) ', end='') else: - cid2mat[cid] = match + matcher.cid2mat[cid] = match elif guess: print('(guessing) ', end='') match = {m.participant:m for m in possible}.get(guess.participant) @@ -245,7 +247,7 @@ def process_month(matcher, cid2mat, uid2cid, year, month): mindelta = delta match = p - cid2mat[cid] = match + matcher.cid2mat[cid] = match possible.remove(match) print(match.participant, 'INSTEAD OF', ' OR '.join([p.participant for p in possible])) @@ -257,7 +259,7 @@ def process_month(matcher, cid2mat, uid2cid, year, month): match = rec2mat.get(rec['id']) if match is None: assert rec['status'] == 'failed', rec['id'] - match = cid2mat.get(cid) # *any* successful exchanges for this user? + match = matcher.cid2mat.get(cid) # *any* successful exchanges for this user? 
if not match: match = matcher.hail_mary(log, rec) writer.writerow([ match.participant @@ -282,9 +284,6 @@ def process_month(matcher, cid2mat, uid2cid, year, month): def main(matcher, constraint): - cid2mat = {} - uid2cid = {} - op = operator.eq if constraint and constraint[0] == '_': constraint = constraint[1:] @@ -295,7 +294,7 @@ def main(matcher, constraint): for month in os.listdir('3912/' + year): if not month.isdigit(): continue if constraint and not op('{}-{}'.format(year, month), constraint): continue - process_month(matcher, cid2mat, uid2cid, year, month) + process_month(matcher, year, month) if __name__ == '__main__': From ca680eec8e067e60d287f5d6660d0a448e8a63ca Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 18 Apr 2016 07:43:16 -0400 Subject: [PATCH 48/63] Factor out a little function --- match-balanced.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 7b39d3709e..3918369f51 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -13,6 +13,9 @@ from gratipay import wireup +header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) + + FULL = """\ SELECT e.*, p.id as user_id @@ -133,8 +136,6 @@ def process_month(matcher, year, month): ordered = [] failed = set() - header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) - header("FINDING") for row in reader: rec = dict(zip(headers, row)) From 303daa48d2a8640e37270d968095ff5c225c0b15 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 18 Apr 2016 07:43:27 -0400 Subject: [PATCH 49/63] Prune dead code --- match-balanced.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 3918369f51..170ac06beb 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -114,12 +114,6 @@ def _find(self, log, rec, relaxed): return found - def hail_mary(self, log, rec): - # XXX I'm not sure this is ever hit! 
- raise NotImplementedError # let's find out - log("full of grace", rec['description']) - return self.username2stub.get(rec['description']) - def process_month(matcher, year, month): input_csv = path.join('3912', year, month, '_balanced.csv') @@ -261,8 +255,7 @@ def process_month(matcher, year, month): if match is None: assert rec['status'] == 'failed', rec['id'] match = matcher.cid2mat.get(cid) # *any* successful exchanges for this user? - if not match: - match = matcher.hail_mary(log, rec) + assert match is not None writer.writerow([ match.participant , match.user_id , rec['links__customer'] From c235c3e4386506ad5c45a3491a8efb2b0efb1282 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Tue, 19 Apr 2016 08:49:04 -0400 Subject: [PATCH 50/63] Start simplifying algorithm, to build back up --- match-balanced.py | 84 +++++++++++++++++++++++------------------------ tally-backfill.py | 2 +- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 170ac06beb..927f8335c0 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -53,7 +53,7 @@ class Matcher(object): def __init__(self, db): self.db = db - self.cid2mat = {} + self.cid2uid = {} self.uid2cid = {} def load_month(self, year, month): @@ -62,38 +62,43 @@ def load_month(self, year, month): def load_stubs(self): self.username2stub = self.db.all(STUBS) - def find(self, log, rec): - log("finding", rec['description'], end=' => ') - found = self._find(log, rec, relaxed=False) + def find(self, log, timestamp, amount, username): + log("finding", username, end=' => ') + found = self._find(log, timestamp, amount, uid=None, username=username) return found[0] if found else None - def fuzz(self, log, rec): - log("fuzzing", rec['description'], end='') - fuzzed = self._find(log, rec, relaxed=True) + def fuzz(self, log, timestamp, amount, uid, username): + log("fuzzing", username, end='') + fuzzed = self._find(log, timestamp, amount, uid=uid, username=username) 
fuzzed.sort(key=lambda x: x.id) return fuzzed - def _find(self, log, rec, relaxed): + def _find(self, log, timestamp, amount, uid, username): found = [] i = 0 + timestamp = datetime_from_iso(timestamp) while i < len(self.exchanges): e = self.exchanges[i] i += 1 + # check uid + if uid and e.user_id != uid: + continue + if uid is None and e.user_id in self.uid2cid: + continue + # check username - if not relaxed: - if e.participant != rec['description']: - continue + if username and e.participant != username: + continue # check amount - amount = D(rec['amount']) + amount = D(amount) if (e.amount > 0) and (e.amount + e.fee != amount): continue if (e.amount < 0) and (e.amount != amount): continue # check timestamp - timestamp = datetime_from_iso(rec['created_at']) if e.timestamp < timestamp: # the Balanced record always precedes the db record continue @@ -109,7 +114,7 @@ def _find(self, log, rec, relaxed): self.exchanges.pop(i) found.append(e) - if not relaxed: + if uid: break return found @@ -162,7 +167,7 @@ def process_month(matcher, year, month): cid = rec['links__customer'] ordered.append(rec) - match = matcher.find(log, rec) + match = matcher.find(log, rec['created_at'], rec['amount'], rec['description']) if match: uid = match.user_id known = matcher.uid2cid.get(uid) @@ -170,7 +175,7 @@ def process_month(matcher, year, month): assert cid == known, (rec, match) else: matcher.uid2cid[uid] = cid - matcher.cid2mat[cid] = match + matcher.cid2uid[cid] = uid rec2mat[rec['id']] = match if match.route is not None: @@ -198,33 +203,27 @@ def process_month(matcher, year, month): header("FUZZING") for rec in inexact: cid = rec['links__customer'] - guess = matcher.cid2mat.get(cid) - - fuzzed = matcher.fuzz(log, rec) - keep = lambda m: (not m.user_id in matcher.uid2cid) or (guess and m.user_id == guess.user_id) - possible = [m for m in fuzzed if keep(m)] + rid = rec['id'] + guess = matcher.cid2uid.get(cid) + possible = matcher.fuzz(log, rec['created_at'], rec['amount'], guess, 
rec['description']) npossible = len(possible) + + def fail(msg): + print(msg) + failed.add(rid) + print(' => ', end='') - match = None - if npossible == 0: - print('???', rec['amount'], end='') # should log "skipping" below - elif npossible == 1: - match = possible[0] - if cid in matcher.cid2mat: - print('(again) ', end='') + if guess: + if npossible == 0: + fail('Eep! Guess failed!') + elif npossible > 1: + fail('What?! Too many!') else: - matcher.cid2mat[cid] = match - elif guess: - print('(guessing) ', end='') - match = {m.participant:m for m in possible}.get(guess.participant) - - if match: - print(match.participant) + match = possible[0] + print(match.participant) elif not possible: - print(' ... IMPOSSIBLE!!!!!!!!!!!') - failed.add(rec['id']) - continue + fail(' ... IMPOSSIBLE!!!!!!!!!!!') else: mindelta = None @@ -242,11 +241,14 @@ def process_month(matcher, year, month): mindelta = delta match = p - matcher.cid2mat[cid] = match + matcher.cid2uid[cid] = match.user_id possible.remove(match) print(match.participant, 'INSTEAD OF', ' OR '.join([p.participant for p in possible])) - rec2mat[rec['id']] = match + if rid in failed: + continue + + rec2mat[rid] = match header("WRITING") for rec in ordered: @@ -254,8 +256,6 @@ def process_month(matcher, year, month): match = rec2mat.get(rec['id']) if match is None: assert rec['status'] == 'failed', rec['id'] - match = matcher.cid2mat.get(cid) # *any* successful exchanges for this user? - assert match is not None writer.writerow([ match.participant , match.user_id , rec['links__customer'] diff --git a/tally-backfill.py b/tally-backfill.py index e224998d81..abaf9b35c4 100755 --- a/tally-backfill.py +++ b/tally-backfill.py @@ -20,6 +20,7 @@ def report(filename, *patterns): report( sys.argv[1] if len(sys.argv) > 1 else 'backfill.log' , 'IMPOSSIBLE!!!!!!!!!!!$' + , 'Eep! 
Guess failed!$' , 'all set!$' , 'yes$' , 'no$' @@ -29,5 +30,4 @@ def report(filename, *patterns): , 'mismatched ref!$' , 'missing status!$' , 'mismatched status!$' - , 'full of grace' ) From 0b2c0b92bd193e8e2136e2ef5abf2216f432c090 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 20 Apr 2016 12:16:34 -0400 Subject: [PATCH 51/63] Prune more dead code --- match-balanced.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 927f8335c0..0907f60377 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -59,9 +59,6 @@ def __init__(self, db): def load_month(self, year, month): self.exchanges = self.db.all(FULL, ('{}-{}'.format(year, month),)) - def load_stubs(self): - self.username2stub = self.db.all(STUBS) - def find(self, log, timestamp, amount, username): log("finding", username, end=' => ') found = self._find(log, timestamp, amount, uid=None, username=username) From c2cf716f571f2c1442fca68a574dae167dc81ec1 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 20 Apr 2016 13:12:18 -0400 Subject: [PATCH 52/63] Start inverting loops on Balanced script We want to find all months first, then fuzz all months, etc., not find and fuzz one month and then the next. 
--- match-balanced.py | 150 ++++++++++++++++++++++++++++++---------------- 1 file changed, 100 insertions(+), 50 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 0907f60377..4f7040f8c3 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -28,14 +28,6 @@ """ -STUBS = """\ - - SELECT username AS participant - , id AS user_id - FROM participants - -""" - def datetime_from_iso(iso): date, time = iso.split('T') @@ -52,13 +44,9 @@ def datetime_from_iso(iso): class Matcher(object): def __init__(self, db): - self.db = db self.cid2uid = {} self.uid2cid = {} - def load_month(self, year, month): - self.exchanges = self.db.all(FULL, ('{}-{}'.format(year, month),)) - def find(self, log, timestamp, amount, username): log("finding", username, end=' => ') found = self._find(log, timestamp, amount, uid=None, username=username) @@ -136,29 +124,6 @@ def process_month(matcher, year, month): for row in reader: rec = dict(zip(headers, row)) - # special-case the first test transactions - if rec['id'] in ('WD7qFYL9rqIrCUmbXsgJJ8HT', 'WD16Zqy9ISWN5muEhXo19vpn'): - continue - - # convert cents to dollars - rec['amount'] = '{}.{}'.format(rec['amount'][:-2], rec['amount'][-2:]) - if rec['amount'].startswith('.'): - rec['amount'] = '0' + rec['amount'] - - # check status - if not rec['status'] in ('succeeded', 'failed'): - raise Exception(rec) - - # check kind - if rec['kind'] == 'card_hold': - continue # we never tracked these in the Gratipay db - elif rec['kind'] in ('credit', 'refund'): - rec['amount'] = '-' + rec['amount'] - elif rec['kind'] in ('debit', 'reversal'): - pass - else: - raise Exception(rec) - log = lambda *a, **kw: print(rec['created_at'], *a, **kw) cid = rec['links__customer'] @@ -274,29 +239,114 @@ def fail(msg): ]) -def main(matcher, constraint): - op = operator.eq - if constraint and constraint[0] == '_': - constraint = constraint[1:] - op = operator.le +### OLD ^^^^^^^^^^^^^^^^^^ +### NEW vvvvvvvvvvvvvvvvvv + + +def get_exchanges(db): + return 
db.all("""\ + + SELECT e.*, p.id as user_id + FROM exchanges e + JOIN participants p + ON e.participant = p.username + WHERE recorder IS NULL -- filter out PayPal + ORDER BY "timestamp" asc + + """) + + +def get_transactions(root): + transactions = [] + for dirpath, dirnames, filenames in os.walk(root): + for filename in filenames: + if filename != '_balanced.csv': + continue + fp = open(os.path.join(dirpath, filename)) + reader = csv.reader(fp) + headers = reader.next() + for row in reader: + rec = dict(zip(headers, row)) + + # special-case the first test transactions + if rec['id'] in ('WD7qFYL9rqIrCUmbXsgJJ8HT', 'WD16Zqy9ISWN5muEhXo19vpn'): + continue + + # special-case escrow shuffles to/from Gratipay + if rec['links__customer'] == 'AC13kr5rmbUkMJWbocmNs3tD': + continue + + # convert cents to dollars + rec['amount'] = '{}.{}'.format(rec['amount'][:-2], rec['amount'][-2:]) + if rec['amount'].startswith('.'): + rec['amount'] = '0' + rec['amount'] + + # check status + if not rec['status'] in ('succeeded', 'failed'): + raise Exception(rec) + + # check kind + if rec['kind'] == 'card_hold': + continue # we never tracked these in the Gratipay db + elif rec['kind'] in ('credit', 'refund'): + rec['amount'] = '-' + rec['amount'] + elif rec['kind'] in ('debit', 'reversal'): + pass + else: + raise Exception(rec) + + transactions.append(rec) + + # may not be necessary, but just to be sure ... + transactions.sort(key=lambda rec: rec['created_at']) + transactions.reverse() + + return transactions + + +def first_pass(gratipay_exchanges, balanced_transactions): + """Remove matches from _exchanges and _transactions and return a list of + (exchange, transaction) match tuples + """ + for exchange in gratipay_exchanges: + pass + + return [] + + +def main(db, root): + + # Load two lists, exchanges and transactions, both sorted by timestamp + # ascending. 
+ + gratipay_exchanges = get_exchanges(db) + balanced_transactions = get_transactions(root) + + print("We have {} exchanges to match!".format(len(gratipay_exchanges))) + print("We have {} transactions to match!".format(len(balanced_transactions))) + + + # Loop through exchanges and match any transactions that we can do so + # definitively: username and amount match exactly, and timestamp is within + # NN seconds (XXX after, right?). + + passes = [first_pass] + + matches = [] + for pass_ in passes: + matches.extend(pass_(gratipay_exchanges, balanced_transactions)) - for year in os.listdir('3912'): - if not year.isdigit(): continue - for month in os.listdir('3912/' + year): - if not month.isdigit(): continue - if constraint and not op('{}-{}'.format(year, month), constraint): continue - process_month(matcher, year, month) + print("We found {} matches!".format(len(matches))) if __name__ == '__main__': _db = wireup.db(wireup.env()) - _matcher = Matcher(_db) - _constraint = '' if len(sys.argv) < 2 else sys.argv[1] - main(_matcher, _constraint) + _root = os.path.abspath('3912') + main(_db, _root) """ -Fields in balanced.dat: +Fields in balanced.csv: id kind From 7094568cbd7bd88c69d03481d6f6a0e27a5c0699 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 20 Apr 2016 13:38:31 -0400 Subject: [PATCH 53/63] Flesh out looping infrastructure --- match-balanced.py | 123 +++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 90 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 4f7040f8c3..4e09dccfb8 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -41,70 +41,6 @@ def datetime_from_iso(iso): return datetime.datetime(year, month, day, hour, minute, second, microsecond, tzinfo=tz) -class Matcher(object): - - def __init__(self, db): - self.cid2uid = {} - self.uid2cid = {} - - def find(self, log, timestamp, amount, username): - log("finding", username, end=' => ') - found = self._find(log, timestamp, amount, uid=None, 
username=username) - return found[0] if found else None - - def fuzz(self, log, timestamp, amount, uid, username): - log("fuzzing", username, end='') - fuzzed = self._find(log, timestamp, amount, uid=uid, username=username) - fuzzed.sort(key=lambda x: x.id) - return fuzzed - - def _find(self, log, timestamp, amount, uid, username): - found = [] - i = 0 - timestamp = datetime_from_iso(timestamp) - while i < len(self.exchanges): - e = self.exchanges[i] - i += 1 - - # check uid - if uid and e.user_id != uid: - continue - if uid is None and e.user_id in self.uid2cid: - continue - - # check username - if username and e.participant != username: - continue - - # check amount - amount = D(amount) - if (e.amount > 0) and (e.amount + e.fee != amount): - continue - if (e.amount < 0) and (e.amount != amount): - continue - - # check timestamp - if e.timestamp < timestamp: - # the Balanced record always precedes the db record - continue - - # keep checking timestamp - delta = e.timestamp - timestamp - threshold = datetime.timedelta(minutes=7) - if delta > threshold: - break - - if not found: - i -= 1 - self.exchanges.pop(i) - found.append(e) - - if uid: - break - - return found - - def process_month(matcher, year, month): input_csv = path.join('3912', year, month, '_balanced.csv') match_csv = path.join('3912', year, month, 'balanced') @@ -299,50 +235,57 @@ def get_transactions(root): # may not be necessary, but just to be sure ... 
transactions.sort(key=lambda rec: rec['created_at']) - transactions.reverse() return transactions -def first_pass(gratipay_exchanges, balanced_transactions): - """Remove matches from _exchanges and _transactions and return a list of - (exchange, transaction) match tuples - """ - for exchange in gratipay_exchanges: - pass - - return [] - +class Matcher(object): -def main(db, root): + def __init__(self, db, root): + self.transactions = get_transactions(root) + self.exchanges = get_exchanges(db) - # Load two lists, exchanges and transactions, both sorted by timestamp - # ascending. + print("We have {} transactions to match!".format(len(self.transactions))) + print("We have {} exchanges to match!".format(len(self.exchanges))) - gratipay_exchanges = get_exchanges(db) - balanced_transactions = get_transactions(root) + self.cid2uid = {} + self.uid2cid = {} - print("We have {} exchanges to match!".format(len(gratipay_exchanges))) - print("We have {} transactions to match!".format(len(balanced_transactions))) + def main(self): + passes = [self.first_pass] + matches = [] + for pass_ in passes: + matches.extend(pass_()) + print("We found {} matches!".format(len(matches))) - # Loop through exchanges and match any transactions that we can do so - # definitively: username and amount match exactly, and timestamp is within - # NN seconds (XXX after, right?). 
- passes = [first_pass] + def loop_over_exchanges(self, start, seconds): + timestamp = datetime_from_iso(start) + limit = timestamp + datetime.timedelta(seconds=seconds) + for exchange in self.exchanges: + if exchange.timestamp > limit: + break + yield exchange - matches = [] - for pass_ in passes: - matches.extend(pass_(gratipay_exchanges, balanced_transactions)) - print("We found {} matches!".format(len(matches))) + def first_pass(self): + """Remove matches from _exchanges and _transactions and return a list of + (exchange, transaction) match tuples + """ + for i, transaction in enumerate(self.transactions): + if i % 1000 == 0: + print('.', end='') + for exchange in self.loop_over_exchanges(transaction['created_at'], 10): + continue + return [] if __name__ == '__main__': _db = wireup.db(wireup.env()) _root = os.path.abspath('3912') - main(_db, _root) + matcher = Matcher(_db, _root) + matcher.main() """ From 49cea72cd60926d8d914edee9cd9394530cf4a80 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Wed, 20 Apr 2016 14:43:30 -0400 Subject: [PATCH 54/63] We have output! 
--- match-balanced.py | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 4e09dccfb8..b16ea9fb94 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -248,16 +248,16 @@ def __init__(self, db, root): print("We have {} transactions to match!".format(len(self.transactions))) print("We have {} exchanges to match!".format(len(self.exchanges))) + self.matches = [] self.cid2uid = {} self.uid2cid = {} def main(self): passes = [self.first_pass] - matches = [] for pass_ in passes: - matches.extend(pass_()) - print("We found {} matches!".format(len(matches))) + pass_() + print("We found {} matches!".format(len(self.matches))) def loop_over_exchanges(self, start, seconds): @@ -277,15 +277,43 @@ def first_pass(self): if i % 1000 == 0: print('.', end='') for exchange in self.loop_over_exchanges(transaction['created_at'], 10): - continue - return [] + + # match amount + amount = D(transaction['amount']) + if (exchange.amount > 0) and (exchange.amount + exchange.fee != amount): + continue + if (exchange.amount < 0) and (exchange.amount != amount): + continue + + # match username + if transaction['description'] != exchange.participant: + continue + + self.matches.append((transaction, exchange)) + + + def dump(self): + out = csv.writer(open('balanced', 'w+')) + for transaction, exchange in self.matches: + out.writerow(( exchange.participant + , exchange.user_id + , transaction['links__customer'] + , exchange.id + , exchange.amount + , transaction['id'] + , transaction['status'] + )) if __name__ == '__main__': _db = wireup.db(wireup.env()) _root = os.path.abspath('3912') matcher = Matcher(_db, _root) - matcher.main() + try: + matcher.main() + except KeyboardInterrupt: + pass + matcher.dump() """ From 2886da239f6221103212832ccd904dfdee631363 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 21 Apr 2016 15:53:28 -0400 Subject: [PATCH 55/63] Seems like our new script is kind of 
working? --- match-balanced.py | 277 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 237 insertions(+), 40 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index b16ea9fb94..09ae2b041c 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -3,9 +3,9 @@ import csv import datetime -import operator import os -import sys +import time +from collections import defaultdict from decimal import Decimal as D from os import path @@ -16,6 +16,9 @@ header = lambda h: print(h.upper() + ' ' + ((80 - len(h) - 1) * '-')) +class Heck(Exception): pass + + FULL = """\ SELECT e.*, p.id as user_id @@ -41,6 +44,27 @@ def datetime_from_iso(iso): return datetime.datetime(year, month, day, hour, minute, second, microsecond, tzinfo=tz) +def usernames_match(transaction, exchange): + if not exchange.participant: import pdb; pdb.set_trace() + return transaction['description'] == exchange.participant + + +def amounts_match(transaction, exchange): + amount = transaction['amount'] + if (exchange.amount > 0) and (exchange.amount + exchange.fee != amount): + return False + if (exchange.amount < 0) and (exchange.amount != amount): + return False + return True + + +def ts_within(transaction, exchange, seconds): + ts_transaction = transaction['timestamp'] + ts_exchange = exchange.timestamp + limit = ts_transaction + datetime.timedelta(seconds=seconds) + return ts_exchange <= limit + + def process_month(matcher, year, month): input_csv = path.join('3912', year, month, '_balanced.csv') match_csv = path.join('3912', year, month, 'balanced') @@ -194,6 +218,9 @@ def get_exchanges(db): def get_transactions(root): transactions = [] + card2usernames = defaultdict(set) + username2cids = defaultdict(set) + for dirpath, dirnames, filenames in os.walk(root): for filename in filenames: if filename != '_balanced.csv': @@ -201,21 +228,28 @@ def get_transactions(root): fp = open(os.path.join(dirpath, filename)) reader = csv.reader(fp) headers = reader.next() + for row in reader: rec = 
dict(zip(headers, row)) + username = rec['description'] + cid = rec['links__customer'] # special-case the first test transactions if rec['id'] in ('WD7qFYL9rqIrCUmbXsgJJ8HT', 'WD16Zqy9ISWN5muEhXo19vpn'): continue # special-case escrow shuffles to/from Gratipay - if rec['links__customer'] == 'AC13kr5rmbUkMJWbocmNs3tD': + if cid == 'AC13kr5rmbUkMJWbocmNs3tD': continue - # convert cents to dollars + # convert cents to decimal dollars rec['amount'] = '{}.{}'.format(rec['amount'][:-2], rec['amount'][-2:]) if rec['amount'].startswith('.'): rec['amount'] = '0' + rec['amount'] + rec['amount'] = D(rec['amount']) + + # convert created_at to timestamp + rec['timestamp'] = datetime_from_iso(rec['created_at']) # check status if not rec['status'] in ('succeeded', 'failed'): @@ -223,73 +257,223 @@ def get_transactions(root): # check kind if rec['kind'] == 'card_hold': - continue # we never tracked these in the Gratipay db + if rec['links__debit']: + + # Balanced has one or two transactions, card_hold and + # possibly debit, where we only have one exchange. We + # can skip card_holds where there are debits. + + continue + else: + + # If we are gonna run with a card_hold, we need to jump + # through hoops to be able to deference a cid from it + # later on. + + card = rec['links__card'] + if not card or not username: + import pdb; pdb.set_trace() + card2usernames[card].add(username) + elif rec['kind'] in ('credit', 'refund'): - rec['amount'] = '-' + rec['amount'] + rec['amount'] = -rec['amount'] elif rec['kind'] in ('debit', 'reversal'): pass else: raise Exception(rec) + # Map cid to usernames--more hoop-jumping + if username and cid: + username2cids[username].add(cid) + transactions.append(rec) # may not be necessary, but just to be sure ... 
transactions.sort(key=lambda rec: rec['created_at']) - return transactions + return transactions, card2usernames, username2cids class Matcher(object): def __init__(self, db, root): - self.transactions = get_transactions(root) + print("Loading transactions ... ", end='') + self.transactions, card2usernames, username2cids = get_transactions(root) + print("we have {} transactions to match!".format(len(self.transactions))) + + print("Loading exchanges ... ", end='') self.exchanges = get_exchanges(db) + print("we have {} exchanges to match!".format(len(self.exchanges))) + + self.uncategorized = {'transactions': [], 'exchanges': []} + + + # Do goofiness to map cid to transactions + + card2cid = {} + for t in self.transactions: + cid, card = t['links__customer'], t['links__card'] + if cid == '': + continue + card2cid[card] = cid + + self.cid2transactions = defaultdict(list) + for t in self.transactions: + cid = t['links__customer'] + + if t['status'] == 'failed' and t['created_at'] < '2014-12-18': + # We didn't record failures before this date. 
+ self.uncategorized['transactions'].append(t) + continue + + if not cid: + if t['kind'] != 'card_hold' or t['links__debit'] != '': + self.uncategorized['transactions'].append(t) + continue + usernames = card2usernames[t['links__card']] + cids = set.union(*[username2cids[username] for username in usernames]) + if len(cids) != 1: + self.uncategorized['transactions'].append(t) + continue + cid = tuple(cids)[0] + + if not cid: + self.uncategorized['transactions'].append(t) + continue - print("We have {} transactions to match!".format(len(self.transactions))) - print("We have {} exchanges to match!".format(len(self.exchanges))) + self.cid2transactions[cid].append(t) + + # A little less goofiness to map uid to exchanges + self.uid2exchanges = defaultdict(list) + for e in self.exchanges: + self.uid2exchanges[e.user_id].append(e) self.matches = [] - self.cid2uid = {} - self.uid2cid = {} - def main(self): - passes = [self.first_pass] - for pass_ in passes: - pass_() - print("We found {} matches!".format(len(self.matches))) + def inner_loop(self, cid, uid): + + transactions = self.cid2transactions[cid] + exchanges = self.uid2exchanges[uid] + + + # Remove from global lists + # ======================== + # also decrement global indices if the transaction or exchange's + # timestamp is less than the one that landed us here + + for transaction in transactions: + if transaction['timestamp'] < self.transactions[self.I]['timestamp']: + self.I -= 1 + self.transactions.remove(transaction) + + for exchange in exchanges: + if exchange.timestamp < self.exchanges[self.J].timestamp: + self.J -= 1 + self.K -= 1 + self.exchanges.remove(exchange) + + # Match items in the local lists if we can. 
+ # ========================================= - def loop_over_exchanges(self, start, seconds): - timestamp = datetime_from_iso(start) - limit = timestamp + datetime.timedelta(seconds=seconds) - for exchange in self.exchanges: - if exchange.timestamp > limit: - break - yield exchange + matched_t = set() + matched_e = set() + for t in transactions: + if t['id'] in matched_t: continue + for e in exchanges: + if e.id in matched_e: continue + if e.timestamp < t['timestamp']: continue + if amounts_match(t, e) and ts_within(t, e, 6*3600): + matched_t.add(t['id']) + matched_e.add(e.id) + self.matches.append((t, e)) + break - def first_pass(self): + self.uncategorized['transactions'] += [t for t in transactions if t['id'] not in matched_t] + self.uncategorized['exchanges'] += [e for e in exchanges if e.id not in matched_e] + + + def main(self): """Remove matches from _exchanges and _transactions and return a list of (exchange, transaction) match tuples """ - for i, transaction in enumerate(self.transactions): - if i % 1000 == 0: - print('.', end='') - for exchange in self.loop_over_exchanges(transaction['created_at'], 10): - - # match amount - amount = D(transaction['amount']) - if (exchange.amount > 0) and (exchange.amount + exchange.fee != amount): - continue - if (exchange.amount < 0) and (exchange.amount != amount): - continue + h = done = self.I = self.J = self.K = 0 + start = time.time() + while not done: + + # output a progress report + h += 1 + if h % 10 == 0: + N = len(self.transactions) + M = len(self.exchanges) + perc = self.I / N + remaining = int((time.time() - start) / (perc or 0.001)) + if remaining > 24*60*60: + remaining = '{:.1f} d'.format(remaining / 60 / 60 / 24) + elif remaining > 60*60: + remaining = '{:.1f} h'.format(remaining / 60 / 60) + elif remaining > 60: + remaining = '{:.1f} m'.format(remaining / 60) + else: + remaining = '{} s'.format(remaining) + print('\r{:>5} / {:>5} = {:4.1f}% | {:>5} / {:>5} = {:4.1f}% | {} matches | T-{}' + .format( 
self.I, N, perc * 100 + , self.J, M, (self.J / M) * 100 + , len(self.matches) + , remaining + ), end='') + + # Grab the next transaction and exchange. + transaction = self.transactions[self.I] + exchange = self.exchanges[self.J] + + # See if the two match. + if amounts_match(transaction, exchange) and usernames_match(transaction, exchange): + cid = transaction['links__customer'] + uid = exchange.user_id + self.inner_loop(cid, uid) + self.K = self.J + continue - # match username - if transaction['description'] != exchange.participant: - continue + # Advance the outer loop. + done = self.advance() + + + def advance(self): + """Return bool (whether to continue the outer loop). + """ + + try: # try incrementing J + I = self.I + J = self.J + 1 + + # Check for the end of the list. + if J > len(self.exchanges): + raise Heck + + # Check for 10+ seconds beyond the transaction. + if not ts_within(self.transactions[I], self.exchanges[J], 10): + raise Heck + + except Heck: # increment I instead - self.matches.append((transaction, exchange)) + I = self.I + 1 + J = self.K + + # Check for the end of the list. + if I > len(self.transactions): + return True + + # Reset J. 
+ transaction = self.transactions[I] + while not ts_within(transaction, self.exchanges[J], 0): + J -= 1 + + self.I = I + self.J = J + return False def dump(self): @@ -304,15 +488,28 @@ def dump(self): , transaction['status'] )) + out = csv.writer(open('uncategorized.exchanges', 'w+')) + for exchange in self.uncategorized['exchanges']: + rec = [x[1] for x in exchange._asdict().items()] + out.writerow(rec) + + out = csv.writer(open('uncategorized.transactions', 'w+')) + for transaction in self.uncategorized['transactions']: + rec = [x[1] for x in sorted(transaction.items())] + out.writerow(rec) + if __name__ == '__main__': _db = wireup.db(wireup.env()) _root = os.path.abspath('3912') matcher = Matcher(_db, _root) + try: matcher.main() except KeyboardInterrupt: pass + + print("\nWe found {} matches!".format(len(matcher.matches))) matcher.dump() From 958e53cb8eb202a911696263cab4230d4ef4f683 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 21 Apr 2016 16:13:23 -0400 Subject: [PATCH 56/63] Show more accurate time remaining --- match-balanced.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 09ae2b041c..d05bc2326e 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -401,6 +401,7 @@ def main(self): """ h = done = self.I = self.J = self.K = 0 start = time.time() + N_initial = len(self.transactions) while not done: # output a progress report @@ -408,8 +409,12 @@ def main(self): if h % 10 == 0: N = len(self.transactions) M = len(self.exchanges) - perc = self.I / N - remaining = int((time.time() - start) / (perc or 0.001)) + perc = (N_initial - N) / N_initial + + elapsed = time.time() - start + total = elapsed / (perc or 0.001) + remaining = total - elapsed + if remaining > 24*60*60: remaining = '{:.1f} d'.format(remaining / 60 / 60 / 24) elif remaining > 60*60: @@ -418,10 +423,12 @@ def main(self): remaining = '{:.1f} m'.format(remaining / 60) else: remaining = '{} 
s'.format(remaining) - print('\r{:>5} / {:>5} = {:4.1f}% | {:>5} / {:>5} = {:4.1f}% | {} matches | T-{}' - .format( self.I, N, perc * 100 - , self.J, M, (self.J / M) * 100 + + print('\r{:>5} / {:>5} | {:>5} / {:>5} | {} matches | {:4.1f}% | T-{:<20}' + .format( self.I, N + , self.J, M , len(self.matches) + , perc * 100 , remaining ), end='') From 497252768acd4925b812a5e1cd8284157209a67d Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 21 Apr 2016 16:53:48 -0400 Subject: [PATCH 57/63] Fix off-by-one bug at the end of the script --- match-balanced.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index d05bc2326e..02dac63c87 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -422,7 +422,7 @@ def main(self): elif remaining > 60: remaining = '{:.1f} m'.format(remaining / 60) else: - remaining = '{} s'.format(remaining) + remaining = '{} s'.format(int(remaining)) print('\r{:>5} / {:>5} | {:>5} / {:>5} | {} matches | {:4.1f}% | T-{:<20}' .format( self.I, N @@ -457,7 +457,7 @@ def advance(self): J = self.J + 1 # Check for the end of the list. - if J > len(self.exchanges): + if J == len(self.exchanges): raise Heck # Check for 10+ seconds beyond the transaction. @@ -470,7 +470,7 @@ def advance(self): J = self.K # Check for the end of the list. - if I > len(self.transactions): + if I == len(self.transactions): return True # Reset J. 
@@ -513,7 +513,7 @@ def dump(self): try: matcher.main() - except KeyboardInterrupt: + except: pass print("\nWe found {} matches!".format(len(matcher.matches))) From 871a2968177159fa2cd5c770ae3cb42cfed6cc5b Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 21 Apr 2016 16:54:08 -0400 Subject: [PATCH 58/63] Kill the rest of the old code --- match-balanced.py | 176 ++++++---------------------------------------- 1 file changed, 20 insertions(+), 156 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 02dac63c87..30f6dc912e 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -7,7 +7,6 @@ import time from collections import defaultdict from decimal import Decimal as D -from os import path import psycopg2.tz from gratipay import wireup @@ -19,25 +18,12 @@ class Heck(Exception): pass -FULL = """\ - - SELECT e.*, p.id as user_id - FROM exchanges e - JOIN participants p - ON e.participant = p.username - WHERE substr("timestamp"::text, 0, 8) = %s - AND recorder IS NULL -- filter out PayPal - ORDER BY "timestamp" asc - -""" - - def datetime_from_iso(iso): - date, time = iso.split('T') - assert time[-1] == 'Z' - time = time[:-1] + date, time_ = iso.split('T') + assert time_[-1] == 'Z' + time_ = time_[:-1] year, month, day = map(int, date.split('-')) - hour, minute, second_microsecond = time.split(':') + hour, minute, second_microsecond = time_.split(':') hour, minute = map(int, (hour, minute)) second, microsecond = map(int, second_microsecond.split('.')) tz = psycopg2.tz.FixedOffsetTimezone(offset=0, name=None) @@ -65,144 +51,6 @@ def ts_within(transaction, exchange, seconds): return ts_exchange <= limit -def process_month(matcher, year, month): - input_csv = path.join('3912', year, month, '_balanced.csv') - match_csv = path.join('3912', year, month, 'balanced') - if not path.isfile(input_csv): return - reader = csv.reader(open(input_csv)) - writer = csv.writer(open(match_csv, 'w+')) - - matcher.load_month(year, month) - - headers = next(reader) - 
rec2mat = {} - inexact = [] - ordered = [] - failed = set() - - header("FINDING") - for row in reader: - rec = dict(zip(headers, row)) - - log = lambda *a, **kw: print(rec['created_at'], *a, **kw) - - cid = rec['links__customer'] - ordered.append(rec) - - match = matcher.find(log, rec['created_at'], rec['amount'], rec['description']) - if match: - uid = match.user_id - known = matcher.uid2cid.get(uid) - if known: - assert cid == known, (rec, match) - else: - matcher.uid2cid[uid] = cid - matcher.cid2uid[cid] = uid - rec2mat[rec['id']] = match - - if match.route is not None: - if match.ref is None and match.status is None: - print('missing ref and status!') - elif match.ref != rec['id'] and match.status != rec['status']: - print('mismatched ref and status!') - elif match.ref is None: - print('missing ref!') - elif match.ref != rec['id']: - print('mismatched ref!') - elif match.status is None: - print('missing status!') - elif match.status != rec['status']: - print('mismatched status!') - else: - ordered.pop() - print('all set!') - else: - print('yes') - else: - inexact.append(rec) - print('no') - - header("FUZZING") - for rec in inexact: - cid = rec['links__customer'] - rid = rec['id'] - guess = matcher.cid2uid.get(cid) - possible = matcher.fuzz(log, rec['created_at'], rec['amount'], guess, rec['description']) - npossible = len(possible) - - def fail(msg): - print(msg) - failed.add(rid) - - print(' => ', end='') - - if guess: - if npossible == 0: - fail('Eep! Guess failed!') - elif npossible > 1: - fail('What?! Too many!') - else: - match = possible[0] - print(match.participant) - elif not possible: - fail(' ... 
IMPOSSIBLE!!!!!!!!!!!') - else: - mindelta = None - - date, time = rec['created_at'].split('T') - Y,M,D = date.split('-') - h,m,s = time.split(':') - s,ms = s.split('.') - ms = ms[:-1] - Y,M,D,h,m,s,ms = [int(x) for x in (Y,M,D,h,m,s,ms)] - ts_balanced = datetime.datetime(Y,M,D,h,m,s,ms, possible[0].timestamp.tzinfo) - - for p in possible: - delta = abs(ts_balanced - p.timestamp) - if mindelta is None or (delta < mindelta): - mindelta = delta - match = p - - matcher.cid2uid[cid] = match.user_id - possible.remove(match) - print(match.participant, 'INSTEAD OF', ' OR '.join([p.participant for p in possible])) - - if rid in failed: - continue - - rec2mat[rid] = match - - header("WRITING") - for rec in ordered: - if rec['id'] in failed: continue - match = rec2mat.get(rec['id']) - if match is None: - assert rec['status'] == 'failed', rec['id'] - writer.writerow([ match.participant - , match.user_id - , rec['links__customer'] - , '' - , rec['created_at'] - , rec['amount'] - , rec['id'] - , rec['status'] - ]) - else: - writer.writerow([ match.participant - , match.user_id - , rec['links__customer'] - , match.id - , '' - , '' - , rec['id'] - , rec['status'] - ]) - - -### OLD ^^^^^^^^^^^^^^^^^^ -### NEW vvvvvvvvvvvvvvvvvv - - def get_exchanges(db): return db.all("""\ @@ -389,6 +237,22 @@ def inner_loop(self, cid, uid): matched_t.add(t['id']) matched_e.add(e.id) self.matches.append((t, e)) + + if 0: + # XXX Bring me back! 
+ if e.ref is None and e.status is None: + print('missing ref and status!') + elif e.ref != t['id'] and e.status != t['status']: + print('mismatched ref and status!') + elif e.ref is None: + print('missing ref!') + elif e.ref != t['id']: + print('mismatched ref!') + elif e.status is None: + print('missing status!') + elif e.status != t['status']: + print('mismatched status!') + break self.uncategorized['transactions'] += [t for t in transactions if t['id'] not in matched_t] From 87cbfbbbe2a1e67fe783081f625f9884065c4a18 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 21 Apr 2016 16:59:32 -0400 Subject: [PATCH 59/63] Rename uncategorized to unmatchable To be parallel to `matched` --- match-balanced.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 30f6dc912e..bb9c9109eb 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -153,7 +153,7 @@ def __init__(self, db, root): self.exchanges = get_exchanges(db) print("we have {} exchanges to match!".format(len(self.exchanges))) - self.uncategorized = {'transactions': [], 'exchanges': []} + self.unmatchable = {'transactions': [], 'exchanges': []} # Do goofiness to map cid to transactions @@ -171,22 +171,22 @@ def __init__(self, db, root): if t['status'] == 'failed' and t['created_at'] < '2014-12-18': # We didn't record failures before this date. 
- self.uncategorized['transactions'].append(t) + self.unmatchable['transactions'].append(t) continue if not cid: if t['kind'] != 'card_hold' or t['links__debit'] != '': - self.uncategorized['transactions'].append(t) + self.unmatchable['transactions'].append(t) continue usernames = card2usernames[t['links__card']] cids = set.union(*[username2cids[username] for username in usernames]) if len(cids) != 1: - self.uncategorized['transactions'].append(t) + self.unmatchable['transactions'].append(t) continue cid = tuple(cids)[0] if not cid: - self.uncategorized['transactions'].append(t) + self.unmatchable['transactions'].append(t) continue self.cid2transactions[cid].append(t) @@ -255,8 +255,8 @@ def inner_loop(self, cid, uid): break - self.uncategorized['transactions'] += [t for t in transactions if t['id'] not in matched_t] - self.uncategorized['exchanges'] += [e for e in exchanges if e.id not in matched_e] + self.unmatchable['transactions'] += [t for t in transactions if t['id'] not in matched_t] + self.unmatchable['exchanges'] += [e for e in exchanges if e.id not in matched_e] def main(self): @@ -359,13 +359,13 @@ def dump(self): , transaction['status'] )) - out = csv.writer(open('uncategorized.exchanges', 'w+')) - for exchange in self.uncategorized['exchanges']: + out = csv.writer(open('unmatchable.exchanges', 'w+')) + for exchange in self.unmatchable['exchanges']: rec = [x[1] for x in exchange._asdict().items()] out.writerow(rec) - out = csv.writer(open('uncategorized.transactions', 'w+')) - for transaction in self.uncategorized['transactions']: + out = csv.writer(open('unmatchable.transactions', 'w+')) + for transaction in self.unmatchable['transactions']: rec = [x[1] for x in sorted(transaction.items())] out.writerow(rec) From e18f900aca5d77b6aeebd2bab8e24c5f4c54c3f7 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 25 Apr 2016 08:07:20 -0400 Subject: [PATCH 60/63] Differentiate error cases --- match-balanced.py | 39 +++++++++++++++++++++++---------------- 1 
file changed, 23 insertions(+), 16 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index bb9c9109eb..13382fbf77 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -153,7 +153,7 @@ def __init__(self, db, root): self.exchanges = get_exchanges(db) print("we have {} exchanges to match!".format(len(self.exchanges))) - self.unmatchable = {'transactions': [], 'exchanges': []} + self.unmatchable = defaultdict(list) # Do goofiness to map cid to transactions @@ -171,22 +171,33 @@ def __init__(self, db, root): if t['status'] == 'failed' and t['created_at'] < '2014-12-18': # We didn't record failures before this date. - self.unmatchable['transactions'].append(t) + self.unmatchable['early_failures'].append(t) continue if not cid: - if t['kind'] != 'card_hold' or t['links__debit'] != '': - self.unmatchable['transactions'].append(t) + # It seems that card holds don't have a customer link. + if t['kind'] != 'card_hold': + self.unmatchable['non_card_hold_without_cid'].append(t) continue + if t['links__debit'] != '': + # Though if the card hold is linked to a debit, then we'll pass here + # and just work with the debit (when we get to it) instead. + continue + if not t['links__card']: + # Okay: but the card hold *should* have a card. + self.unmatchable['card_hold_without_card'].append(t) + continue + + # Can we unambiguously determine a cid from the card? 
usernames = card2usernames[t['links__card']] cids = set.union(*[username2cids[username] for username in usernames]) if len(cids) != 1: - self.unmatchable['transactions'].append(t) + self.unmatchable['ambiguous_card_hold'].append(t) continue cid = tuple(cids)[0] if not cid: - self.unmatchable['transactions'].append(t) + self.unmatchable['still_no_cid'].append(t) continue self.cid2transactions[cid].append(t) @@ -255,7 +266,7 @@ def inner_loop(self, cid, uid): break - self.unmatchable['transactions'] += [t for t in transactions if t['id'] not in matched_t] + self.unmatchable['dregs'] += [t for t in transactions if t['id'] not in matched_t] self.unmatchable['exchanges'] += [e for e in exchanges if e.id not in matched_e] @@ -359,15 +370,11 @@ def dump(self): , transaction['status'] )) - out = csv.writer(open('unmatchable.exchanges', 'w+')) - for exchange in self.unmatchable['exchanges']: - rec = [x[1] for x in exchange._asdict().items()] - out.writerow(rec) - - out = csv.writer(open('unmatchable.transactions', 'w+')) - for transaction in self.unmatchable['transactions']: - rec = [x[1] for x in sorted(transaction.items())] - out.writerow(rec) + for reason in self.unmatchable: + out = csv.writer(open('unmatchable.{}'.format(reason), 'w+')) + flatten = lambda o: o._asdict().items() if reason == 'exchanges' else sorted(o.items()) + for rec in self.unmatchable[reason]: + out.writerow([kv[1] for kv in flatten(rec)]) if __name__ == '__main__': From 00bb7e31f0c2eade4d4304b8d7c72a9b2640c787 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 25 Apr 2016 09:45:02 -0400 Subject: [PATCH 61/63] Add an option to ctrl-c without dumping Also output kind for debugging --- match-balanced.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 13382fbf77..8c53b1ae65 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -368,6 +368,7 @@ def dump(self): , exchange.amount , transaction['id'] , transaction['status'] 
+ , transaction['kind'] )) for reason in self.unmatchable: @@ -385,10 +386,16 @@ def dump(self): try: matcher.main() except: - pass + ask_before_dumping = True + else: + ask_before_dumping = False print("\nWe found {} matches!".format(len(matcher.matches))) - matcher.dump() + if ask_before_dumping: + if raw_input("Dump data so far? (y/N) ") == 'y': + matcher.dump() + else: + matcher.dump() """ From be36a2678ac071eb94d638531dcec74c4cf1362f Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Mon, 25 Apr 2016 18:55:16 -0400 Subject: [PATCH 62/63] Match a bunch of failures --- match-balanced.py | 58 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index 8c53b1ae65..e28d1bc7c4 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -16,6 +16,7 @@ class Heck(Exception): pass +SIXTY_SECONDS = datetime.timedelta(seconds=60) def datetime_from_iso(iso): @@ -241,30 +242,43 @@ def inner_loop(self, cid, uid): for t in transactions: if t['id'] in matched_t: continue + timelimit = t['timestamp'] - SIXTY_SECONDS for e in exchanges: if e.id in matched_e: continue - if e.timestamp < t['timestamp']: continue - if amounts_match(t, e) and ts_within(t, e, 6*3600): - matched_t.add(t['id']) - matched_e.add(e.id) - self.matches.append((t, e)) - - if 0: - # XXX Bring me back! - if e.ref is None and e.status is None: - print('missing ref and status!') - elif e.ref != t['id'] and e.status != t['status']: - print('mismatched ref and status!') - elif e.ref is None: - print('missing ref!') - elif e.ref != t['id']: - print('mismatched ref!') - elif e.status is None: - print('missing status!') - elif e.status != t['status']: - print('mismatched status!') - - break + if e.timestamp < timelimit or not ts_within(t, e, 6*3600): continue + if not amounts_match(t, e): + + # We appear to have recorded the nominal amount of the tip + # for failed exchanges, not the charge amount. 
I guess + # let's link these on the strength of the cid/uid and + # timestamp match? + + if t['status'] == 'failed': + if e.amount > t['amount']: + continue + else: + continue + + matched_t.add(t['id']) + matched_e.add(e.id) + self.matches.append((t, e)) + + if 0: + # XXX Bring me back! + if e.ref is None and e.status is None: + print('missing ref and status!') + elif e.ref != t['id'] and e.status != t['status']: + print('mismatched ref and status!') + elif e.ref is None: + print('missing ref!') + elif e.ref != t['id']: + print('mismatched ref!') + elif e.status is None: + print('missing status!') + elif e.status != t['status']: + print('mismatched status!') + + break self.unmatchable['dregs'] += [t for t in transactions if t['id'] not in matched_t] self.unmatchable['exchanges'] += [e for e in exchanges if e.id not in matched_e] From 063a8cf685dfd5387a7ff184d859394bf70a87b1 Mon Sep 17 00:00:00 2001 From: Chad Whitacre Date: Thu, 28 Apr 2016 09:58:51 -0400 Subject: [PATCH 63/63] Start looking at the exchange side --- match-balanced.py | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/match-balanced.py b/match-balanced.py index e28d1bc7c4..5ecd0d7cd6 100755 --- a/match-balanced.py +++ b/match-balanced.py @@ -55,12 +55,14 @@ def ts_within(transaction, exchange, seconds): def get_exchanges(db): return db.all("""\ - SELECT e.*, p.id as user_id - FROM exchanges e - JOIN participants p - ON e.participant = p.username - WHERE recorder IS NULL -- filter out PayPal - ORDER BY "timestamp" asc + SELECT e.*, p.id as user_id, r.network as network + FROM exchanges e + JOIN participants p + ON e.participant = p.username + LEFT OUTER JOIN exchange_routes r + ON e.route = r.id + WHERE recorder IS NULL -- filter out PayPal + ORDER BY "timestamp" asc """) @@ -243,9 +245,18 @@ def inner_loop(self, cid, uid): for t in transactions: if t['id'] in matched_t: continue timelimit = t['timestamp'] - SIXTY_SECONDS - for e in 
exchanges: - if e.id in matched_e: continue - if e.timestamp < timelimit or not ts_within(t, e, 6*3600): continue + i = 0 + while i < len(exchanges): + e = exchanges[i] + assert not e.id in matched_e + + # Fast-forward to an exchange that is within a certain time + # tolerance of the current transaction. + if e.timestamp < timelimit or not ts_within(t, e, 6*3600): + i += 1 + continue + + # See if the amounts match. if not amounts_match(t, e): # We appear to have recorded the nominal amount of the tip @@ -255,15 +266,24 @@ def inner_loop(self, cid, uid): if t['status'] == 'failed': if e.amount > t['amount']: + self.unmatchable['exchanges.bad-amount-failed'].append(e) + exchanges.remove(e) continue else: + self.unmatchable['exchanges.bad-amount-succeeded'].append(e) + exchanges.remove(e) continue + exchanges.remove(e) matched_t.add(t['id']) matched_e.add(e.id) self.matches.append((t, e)) if 0: + if e.network not in ('balanced-cc', 'balanced-ba', None): + self.unmatchable['exchanges.bad-network'].append(e) + exchanges.remove(e) + continue # XXX Bring me back! if e.ref is None and e.status is None: print('missing ref and status!') @@ -387,7 +407,8 @@ def dump(self): for reason in self.unmatchable: out = csv.writer(open('unmatchable.{}'.format(reason), 'w+')) - flatten = lambda o: o._asdict().items() if reason == 'exchanges' else sorted(o.items()) + is_exchanges = lambda r: r.startswith('exchanges') + flatten = lambda o: o._asdict().items() if is_exchanges(reason) else sorted(o.items()) for rec in self.unmatchable[reason]: out.writerow([kv[1] for kv in flatten(rec)])