diff --git a/app/app.py b/app/app.py index 2e2f710..3689ca0 100644 --- a/app/app.py +++ b/app/app.py @@ -6,12 +6,15 @@ from flask_session import Session app = Flask(__name__) app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw" # replace with a secure random key +app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw" # replace with a secure random key app.config['SESSION_TYPE'] = 'filesystem' app.config['SESSION_FILE_DIR'] = './.flask_session/' Session(app) STRIPE_COLS = ['Type', 'ID', 'Created', 'Description', 'Amount', 'Currency', 'Converted Amount', 'Fees', 'Net', 'Converted Currency', 'Details'] RAISENOW_COLS = ['Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status', 'Betrag', 'Währung', 'Übernommene Gebühren - Betrag', 'Übernommene Gebühren - Währung', 'Zahlungsmethode', 'Zahlungsanbieter', 'Nettobetrag', 'Auszahlungswährung'] +STRIPE_COLS = ['Type', 'ID', 'Created', 'Description', 'Amount', 'Currency', 'Converted Amount', 'Fees', 'Net', 'Converted Currency', 'Details'] +RAISENOW_COLS = ['Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status', 'Betrag', 'Währung', 'Übernommene Gebühren - Betrag', 'Übernommene Gebühren - Währung', 'Zahlungsmethode', 'Zahlungsanbieter', 'Nettobetrag', 'Auszahlungswährung'] def get_dataframe(key): @@ -86,6 +89,18 @@ def get_merged_df(table_name): .str.extract(r'https?://[^/]+/([^/?#]+)')[0] ) + # --- return raw tables if requested --- + if table_name == 'stripe_import': + return stripe.dropna(axis=1, how='all') + if table_name == 'raiseNow_import': + return raisenow.dropna(axis=1, how='all') + # additional assignment: build a mask of rows where norm_zweck is still empty/NaN + mask = raisenow['norm_zweck'].isna() | (raisenow['norm_zweck'] == '') + raisenow.loc[mask, 'norm_zweck'] = ( + raisenow.loc[mask, 'raisenow_parameters.product.source_url'] + .str.extract(r'https?://[^/]+/([^/?#]+)')[0] + ) + # --- return raw tables if requested --- if table_name == 'stripe_import': return stripe.dropna(axis=1, how='all') @@ -96,6 +111,26 @@ def get_merged_df(table_name): pairs = [] # index Raisenow rows for fast lookup + dropping rr = raisenow.set_index('idx_raisenow') + for _, s in stripe.iterrows(): + # filter candidates by amount & name + cand = rr[ + (rr['norm_amount'] == s['norm_amount']) & + (rr['norm_name'] == s['norm_name']) + ].copy() + if cand.empty: + continue + # compute absolute date difference (days only) + date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs() + exact_cand = cand[date_diff == pd.Timedelta(0)] + if not exact_cand.empty: + # pick the first exact match + best = exact_cand.index[0] + pairs.append((int(s['idx_stripe']), int(best))) + rr = rr.drop(best) + # --- 1) Greedy exact same-day matches --- + pairs = [] + # index Raisenow rows for fast lookup + dropping + rr = raisenow.set_index('idx_raisenow') for _, s in stripe.iterrows(): # filter candidates by amount & name cand = rr[ @@ -146,6 +181,40 @@ def get_merged_df(table_name): combined = pd.DataFrame(merged_rows) + # --- slice out the requested view --- + # --- 2) Greedy fuzzy ±1-day matches on remaining rows --- + used_stripe = {s for s, _ in pairs} + stripe_left = stripe[~stripe['idx_stripe'].isin(used_stripe)].copy() + for _, s in stripe_left.iterrows(): + cand = rr[ + (rr['norm_amount'] == s['norm_amount']) & + (rr['norm_name'] == s['norm_name']) + ].copy() + if cand.empty: + continue + date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs() + cand = cand[date_diff <= pd.Timedelta(days=1)] + if cand.empty: + continue + # pick the one with the smallest gap + best = date_diff.idxmin() + pairs.append((int(s['idx_stripe']), int(best))) + rr = rr.drop(best) + + # --- build the merged DataFrame without suffixes --- + merged_rows = [] + for s_idx, r_idx in pairs: + srow = stripe.loc[s_idx].to_dict() + rrow = raisenow.loc[r_idx].to_dict() + # drop any overlapping keys so we never get suffixes + for k in ['norm_amount','norm_name','norm_date','norm_email','idx_stripe']: + rrow.pop(k, None) + # now combine so stripe values win for those keys, and raisenow adds its own columns + merged = {**srow, **rrow} + merged_rows.append(merged) + + combined = pd.DataFrame(merged_rows) + # --- slice out the requested view --- if table_name == 'merged': result = combined @@ -224,6 +293,16 @@ def download(): 'raisenow_only' ] } + sheets = { + name: get_merged_df(name) + for name in [ + 'stripe_import', + 'raiseNow_import', + 'merged', + 'stripe_only', + 'raisenow_only' + ] + } output = BytesIO() with pd.ExcelWriter(output, engine='xlsxwriter') as writer: