From 50659f6f2b5481b10fd414589a4058bd96378f46 Mon Sep 17 00:00:00 2001
From: lelo
Date: Wed, 21 May 2025 20:21:12 +0200
Subject: [PATCH] update merge

---
 .gitignore               |   1 +
 app/app.py               | 222 +++++++++++++++++++++++----------------
 app/requirements.txt     |   3 +-
 app/templates/index.html |   1 -
 4 files changed, 135 insertions(+), 92 deletions(-)

diff --git a/.gitignore b/.gitignore
index 714cf83..92b7d73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 app/__pycache__/
+app/.flask_session/
 .env

diff --git a/app/app.py b/app/app.py
index 92ee966..4a532f2 100644
--- a/app/app.py
+++ b/app/app.py
@@ -5,24 +5,13 @@ from io import BytesIO
 from flask_session import Session
 
 app = Flask(__name__)
-app.secret_key = "your-secret-key"  # replace with a secure random key
-# Configure server-side session (filesystem) to avoid size limits in cookies
+app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw"  # replace with a secure random key
 app.config['SESSION_TYPE'] = 'filesystem'
 app.config['SESSION_FILE_DIR'] = './.flask_session/'
 Session(app)
 
-STRIPE_COLS = [
-    'Type','ID','Created','Description','Amount','Currency',
-    'Converted Amount','Fees','Net','Converted Currency',
-    'Customer Name','Customer Email','Details'
-]
-RAISENOW_COLS = [
-    'Identifikationsnummer','Erstellt','UTC-Offset','Status',
-    'Betrag','Währung','Übernommene Gebühren - Betrag',
-    'Übernommene Gebühren - Währung','Zahlungsmethode',
-    'Zahlungsanbieter','Vorname','Nachname','E-Mail-Adresse',
-    'custom_parameters.altruja_action_name','custom_parameters.altruja_custom1_code'
-]
+STRIPE_COLS = ['Type', 'ID', 'Created', 'Description', 'Amount', 'Currency', 'Converted Amount', 'Fees', 'Net', 'Converted Currency', 'Details']
+RAISENOW_COLS = ['Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status', 'Betrag', 'Währung', 'Übernommene Gebühren - Betrag', 'Übernommene Gebühren - Währung', 'Zahlungsmethode', 'Zahlungsanbieter', 'Nettobetrag', 'Auszahlungswährung']
 
 
 def get_dataframe(key, cols):
@@ -37,92 +26,126 @@ def get_dataframe(key, cols):
     return df
 
 
-import pandas as pd
-
 def get_merged_df(table_name):
     """
-    Return a DataFrame for the given table_name based on stripe and raisenow inputs,
-    including a secondary merge for date tolerance of ±1 day.
+    Return a DataFrame for the given table_name based on Stripe and RaiseNow inputs,
+    enforcing strict one-to-one matching with:
+      - exact same-day matches first
+      - then ±1-day fuzzy matches
+      - no pandas merge suffixes at all
+      - all original columns (including RaiseNow's norm_zweck) preserved
     """
-    stripe_df = get_dataframe('stripe_import', STRIPE_COLS)
-    raisenow_df = get_dataframe('raiseNow_import', RAISENOW_COLS)
-
-    # Normalize stripe
-    stripe_df = stripe_df.query("Type == 'Charge'")
-    stripe_df['norm_date'] = pd.to_datetime(stripe_df['Created'], format='%Y-%m-%d %H:%M')
-    stripe_df['norm_amount'] = stripe_df['Amount'].astype(str).str.replace(',', '.')
-    stripe_df['norm_amount'] = stripe_df['norm_amount'].astype(float)
-    stripe_df['norm_email'] = stripe_df['Customer Email'].astype(str)
-    stripe_df['norm_name'] = stripe_df.apply(
-        lambda x: x['Customer Name'] if x.get('Customer Name') else x['Details'],
+    # --- load & normalize Stripe ---
+    stripe = (
+        get_dataframe('stripe_import', STRIPE_COLS)
+        .query("Type == 'Charge'")
+        .copy()
+    )
+    stripe['idx_stripe'] = stripe.index
+    stripe['norm_date'] = pd.to_datetime(stripe['Created'], format='%Y-%m-%d %H:%M')
+    stripe['norm_amount'] = stripe['Amount'].astype(str).str.replace(',', '.').astype(float)
+    stripe['norm_email'] = stripe['Customer Email'].astype(str)
+    stripe['norm_name'] = stripe.apply(
+        lambda r: r['Customer Name'] or r['Details'],
         axis=1
+    )
+
+    # --- load & normalize RaiseNow ---
+    raisenow = (
+        get_dataframe('raiseNow_import', RAISENOW_COLS)
+        .query("Zahlungsmethode != 'paypal'")
+        .query("Status == 'succeeded'")
+        .copy()
+    )
+
+    raisenow['idx_raisenow'] = raisenow.index
+    raisenow['norm_date'] = pd.to_datetime(raisenow['Erstellt'], format='%Y-%m-%d %H:%M')
+    raisenow['norm_amount'] = raisenow['Betrag'].astype(float)
+    raisenow['norm_email'] = raisenow['E-Mail-Adresse'].astype(str)
+    raisenow['norm_name'] = raisenow['Vorname'].astype(str) + ' ' + raisenow['Nachname'].astype(str)
+
+    # norm_zweck, first pass: prefer the Altruja action name, else the custom1 code
+    raisenow['norm_zweck'] = raisenow.apply(
+        lambda r: r.get('custom_parameters.altruja_action_name')
+        or r.get('custom_parameters.altruja_custom1_code'),
+        axis=1
     )
-
-    # Normalize raisenow
-    raisenow_df = raisenow_df.query("Zahlungsmethode != 'paypal'")
-    raisenow_df = raisenow_df.query("Status == 'succeeded'")
-    raisenow_df['norm_date'] = pd.to_datetime(raisenow_df['Erstellt'], format='%Y-%m-%d %H:%M')
-    raisenow_df['norm_amount'] = raisenow_df['Betrag'].astype(float)
-    raisenow_df['norm_name'] = (
-        raisenow_df['Vorname'].astype(str) + ' ' + raisenow_df['Nachname'].astype(str)
-    )
-    raisenow_df['norm_email'] = raisenow_df['E-Mail-Adresse'].astype(str)
-    raisenow_df['norm_zweck'] = raisenow_df.apply(
-        lambda x: x['custom_parameters.altruja_action_name']
-        if x.get('custom_parameters.altruja_action_name')
-        else x.get('custom_parameters.altruja_custom1_code'),
-        axis=1
+    # second pass: where norm_zweck is still empty/NaN, fall back to the slug from the product source URL
+    mask = raisenow['norm_zweck'].isna() | (raisenow['norm_zweck'] == '')
+    raisenow.loc[mask, 'norm_zweck'] = (
+        raisenow.loc[mask, 'raisenow_parameters.product.source_url']
+        .str.extract(r'https?://[^/]+/([^/?#]+)')[0]
     )
 
-    if table_name in ('stripe_import', 'raiseNow_import'):
-        df = stripe_df if table_name == 'stripe_import' else raisenow_df
-        return df.dropna(axis=1, how='all')
+    # --- return raw tables if requested ---
+    if table_name == 'stripe_import':
+        return stripe.dropna(axis=1, how='all')
+    if table_name == 'raiseNow_import':
+        return raisenow.dropna(axis=1, how='all')
 
-    # Exact merge
-    exact = pd.merge(
-        stripe_df,
-        raisenow_df,
-        on=['norm_amount', 'norm_name'],
-        how='outer',
-        suffixes=('_stripe', '_raisenow'),
-        indicator=True
-    )
-    exact['date_diff'] = (
-        exact['norm_date_stripe'].dt.date - exact['norm_date_raisenow'].dt.date
-    ).abs()
+    # --- 1) Greedy exact same-day matches ---
+    pairs = []
+    # index RaiseNow rows for fast lookup + dropping
+    rr = raisenow.set_index('idx_raisenow')
+    for _, s in stripe.iterrows():
+        # filter candidates by amount & name
+        cand = rr[
+            (rr['norm_amount'] == s['norm_amount']) &
+            (rr['norm_name'] == s['norm_name'])
+        ].copy()
+        if cand.empty:
+            continue
+        # compute absolute date difference (days only)
+        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
+        exact_cand = cand[date_diff == pd.Timedelta(0)]
+        if not exact_cand.empty:
+            # pick the first exact match
+            best = exact_cand.index[0]
+            pairs.append((int(s['idx_stripe']), int(best)))
+            rr = rr.drop(best)
 
-    # Separate matches
-    exact_matches = exact[(exact['_merge'] == 'both') & (exact['date_diff'] == pd.Timedelta(0))].copy()
-    stripe_only = exact[exact['_merge'] == 'left_only'].copy()
-    raisenow_only = exact[exact['_merge'] == 'right_only'].copy()
+    # --- 2) Greedy fuzzy ±1-day matches on remaining rows ---
+    used_stripe = {s for s, _ in pairs}
+    stripe_left = stripe[~stripe['idx_stripe'].isin(used_stripe)].copy()
+    for _, s in stripe_left.iterrows():
+        cand = rr[
+            (rr['norm_amount'] == s['norm_amount']) &
+            (rr['norm_name'] == s['norm_name'])
+        ].copy()
+        if cand.empty:
+            continue
+        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
+        # keep only candidates within the ±1-day window
+        date_diff = date_diff[date_diff <= pd.Timedelta(days=1)]
+        if date_diff.empty:
+            continue
+        # pick the candidate with the smallest date gap
+        best = date_diff.idxmin()
+        pairs.append((int(s['idx_stripe']), int(best)))
+        rr = rr.drop(best)
 
-    # Fuzzy merge within ±1 day for remaining
-    # Merge stripe_only with raisenow_only on name and amount
-    fuzzy = pd.merge(
-        stripe_only.drop(columns=['_merge']),
-        raisenow_only.drop(columns=['_merge']),
-        on=['norm_amount', 'norm_name'],
-        suffixes=('_stripe', '_raisenow')
-    )
-    fuzzy['date_diff'] = (
-        fuzzy['norm_date_stripe'].dt.date - fuzzy['norm_date_raisenow'].dt.date
-    ).abs()
-    fuzzy_matches = fuzzy[fuzzy['date_diff'] <= pd.Timedelta(days=1)].copy()
+    # --- build the merged DataFrame without suffixes ---
+    merged_rows = []
+    for s_idx, r_idx in pairs:
+        srow = stripe.loc[s_idx].to_dict()
+        rrow = raisenow.loc[r_idx].to_dict()
+        # drop any overlapping keys so we never get suffixes
+        for k in ['norm_amount','norm_name','norm_date','norm_email','idx_stripe']:
+            rrow.pop(k, None)
+        # combine: Stripe keeps the shared norm_* keys, RaiseNow contributes its own columns
+        merged = {**srow, **rrow}
+        merged_rows.append(merged)
 
-    # Combine exact and fuzzy
-    combined = pd.concat([exact_matches, fuzzy_matches], ignore_index=True)
-    combined = combined.drop(columns=['_merge', 'date_diff'], errors='ignore')
+    combined = pd.DataFrame(merged_rows)
 
-    # Determine outputs
+    # --- slice out the requested view ---
     if table_name == 'merged':
         result = combined
     elif table_name == 'stripe_only':
-        # Exclude those in combined
-        matched_stripe_ids = combined['_stripe'] if '_stripe' in combined else None
-        result = stripe_df[~stripe_df.index.isin(matched_stripe_ids)]
+        used = {s for s, _ in pairs}
+        result = stripe[~stripe['idx_stripe'].isin(used)]
     elif table_name == 'raisenow_only':
-        matched_raisenow_ids = combined['_raisenow'] if '_raisenow' in combined else None
-        result = raisenow_df[~raisenow_df.index.isin(matched_raisenow_ids)]
+        used = {r for _, r in pairs}
+        result = raisenow[~raisenow['idx_raisenow'].isin(used)]
     else:
         raise ValueError(f"Unknown table_name '{table_name}'")
 
@@ -148,7 +171,6 @@ def upload():
         raw = raw.dropna(how='all').dropna(axis=1, how='all')
         raw = raw.astype(object).replace({np.nan: None})
         cols = list(raw.columns)
-
         if cols[:len(STRIPE_COLS)] == STRIPE_COLS:
             key = 'stripe_import'
             dedupe_col = 'ID'
@@ -171,10 +193,8 @@ def get_table():
     table = request.args.get('table')
-    try:
-        df = get_merged_df(table)
-    except Exception as e:
-        return jsonify({'error': str(e)}), 400
+
+    df = get_merged_df(table)
     df = df.astype(object).where(pd.notnull(df), None)
 
     return jsonify({
@@ -185,13 +205,36 @@ def download():
-    sheets = { name: get_merged_df(name)
-               for name in ['stripe_import','raiseNow_import','merged','stripe_only','raisenow_only'] }
+    sheets = {
+        name: get_merged_df(name)
+        for name in [
+            'stripe_import',
+            'raiseNow_import',
+            'merged',
+            'stripe_only',
+            'raisenow_only'
+        ]
+    }
     output = BytesIO()
     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+        workbook = writer.book
         for name, df in sheets.items():
             df.to_excel(writer, sheet_name=name, index=False)
+            worksheet = writer.sheets[name]
+
+            # 1) Freeze header row
+            worksheet.freeze_panes(1, 0)
+
+            # 2) Autofilter on the header row across all columns
+            #    (0, 0) is the top-left cell; (df.shape[0], df.shape[1] - 1) covers all data rows
+            worksheet.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
+
+            # 3) Set column widths to match the first data row (skip empty sheets)
+            if not df.empty:
+                first_row = df.iloc[0].astype(str)
+                for col_idx, cell_value in enumerate(first_row):
+                    worksheet.set_column(col_idx, col_idx, len(cell_value) + 2)
     output.seek(0)
 
     return send_file(
diff --git a/app/requirements.txt b/app/requirements.txt
index 04adec8..a9c760b 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -1,4 +1,5 @@
 Flask
 flask_session
 pandas
-openpyxl
\ No newline at end of file
+openpyxl
+xlsxwriter
\ No newline at end of file
diff --git a/app/templates/index.html b/app/templates/index.html
index d82aa4c..ef1cc69 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -70,7 +70,6 @@
 
       // error handling
       if (!resp.ok) {
-        if (table) table.hideLoader();
         return alert(json.error || 'Error loading');
       }
 
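
Reviewer note, appended after the patch rather than spliced into it: the snippet below is a minimal, standalone sketch of the pairing strategy the new get_merged_df uses — an exact same-day pass first, then a ±1-day pass over the remaining rows, with every RaiseNow row consumed at most once. The toy data, the greedy_pairs helper and its signature are illustrative assumptions; only the norm_* column names and the matching rules mirror app.py.

import pandas as pd

# Toy frames that only carry the normalized columns the app builds; the values
# and index numbers are made up for this sketch.
stripe = pd.DataFrame({
    'idx_stripe':  [0, 1],
    'norm_name':   ['Ada Lovelace', 'Ada Lovelace'],
    'norm_amount': [50.0, 50.0],
    'norm_date':   pd.to_datetime(['2025-05-20 10:00', '2025-05-21 09:30']),
})
raisenow = pd.DataFrame({
    'idx_raisenow': [10, 11],
    'norm_name':    ['Ada Lovelace', 'Ada Lovelace'],
    'norm_amount':  [50.0, 50.0],
    'norm_date':    pd.to_datetime(['2025-05-21 08:00', '2025-05-22 23:00']),
})

def greedy_pairs(stripe_rows, rr, max_gap_days, pairs):
    # Pair each Stripe row with at most one unused RaiseNow row whose amount and
    # name match and whose calendar date lies within max_gap_days.
    for _, s in stripe_rows.iterrows():
        cand = rr[(rr['norm_amount'] == s['norm_amount']) &
                  (rr['norm_name'] == s['norm_name'])]
        if cand.empty:
            continue
        gap = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
        gap = gap[gap <= pd.Timedelta(days=max_gap_days)]
        if gap.empty:
            continue
        best = gap.idxmin()               # closest calendar date wins
        pairs.append((int(s['idx_stripe']), int(best)))
        rr = rr.drop(best)                # consume the RaiseNow row
    return rr

pairs = []
rr = raisenow.set_index('idx_raisenow')
rr = greedy_pairs(stripe, rr, 0, pairs)   # pass 1: exact same-day matches
left = stripe[~stripe['idx_stripe'].isin({s for s, _ in pairs})]
rr = greedy_pairs(left, rr, 1, pairs)     # pass 2: ±1-day matches on the rest

print(pairs)              # [(1, 10)] -- the 2025-05-22 row is 2 days away and stays unmatched
print(rr.index.tolist())  # [11]      -- it would land in the 'raisenow_only' view

Running the exact pass before the fuzzy pass matters here: if the ±1-day pass ran first, the 2025-05-20 charge could claim the 2025-05-21 RaiseNow row and leave the true same-day pair unmatched.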