From 50659f6f2b5481b10fd414589a4058bd96378f46 Mon Sep 17 00:00:00 2001
From: lelo
Date: Wed, 21 May 2025 20:21:12 +0200
Subject: [PATCH] update merge

---
 .gitignore               |   1 +
 app/app.py               | 222 +++++++++++++++++++++++----------------
 app/requirements.txt     |   3 +-
 app/templates/index.html |   1 -
 4 files changed, 135 insertions(+), 92 deletions(-)

diff --git a/.gitignore b/.gitignore
index 714cf83..92b7d73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 app/__pycache__/
+app/.flask_session/
 .env

diff --git a/app/app.py b/app/app.py
index 92ee966..4a532f2 100644
--- a/app/app.py
+++ b/app/app.py
@@ -5,24 +5,13 @@ from io import BytesIO
 from flask_session import Session
 
 app = Flask(__name__)
-app.secret_key = "your-secret-key"  # replace with a secure random key
-# Configure server-side session (filesystem) to avoid size limits in cookies
+app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw"  # replace with a secure random key
 app.config['SESSION_TYPE'] = 'filesystem'
 app.config['SESSION_FILE_DIR'] = './.flask_session/'
 Session(app)
 
-STRIPE_COLS = [
-    'Type','ID','Created','Description','Amount','Currency',
-    'Converted Amount','Fees','Net','Converted Currency',
-    'Customer Name','Customer Email','Details'
-]
-RAISENOW_COLS = [
-    'Identifikationsnummer','Erstellt','UTC-Offset','Status',
-    'Betrag','Währung','Übernommene Gebühren - Betrag',
-    'Übernommene Gebühren - Währung','Zahlungsmethode',
-    'Zahlungsanbieter','Vorname','Nachname','E-Mail-Adresse',
-    'custom_parameters.altruja_action_name','custom_parameters.altruja_custom1_code'
-]
+STRIPE_COLS = ['Type', 'ID', 'Created', 'Description', 'Amount', 'Currency', 'Converted Amount', 'Fees', 'Net', 'Converted Currency', 'Details']
+RAISENOW_COLS = ['Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status', 'Betrag', 'Währung', 'Übernommene Gebühren - Betrag', 'Übernommene Gebühren - Währung', 'Zahlungsmethode', 'Zahlungsanbieter', 'Nettobetrag', 'Auszahlungswährung']
 
 
 def get_dataframe(key, cols):
@@ -37,92 +26,126 @@ def get_dataframe(key, cols):
     return df
 
 
-import pandas as pd
-
 def get_merged_df(table_name):
     """
-    Return a DataFrame for the given table_name based on stripe and raisenow inputs,
-    including a secondary merge for date tolerance of ±1 day.
+    Return a DataFrame for the given table_name based on Stripe and RaiseNow inputs,
+    enforcing strict one-to-one matching with:
+      - exact same-day matches first
+      - then ±1-day fuzzy matches
+      - no pandas merge suffixes at all
+      - all original columns (including RaiseNow's norm_zweck) preserved
     """
-    stripe_df = get_dataframe('stripe_import', STRIPE_COLS)
-    raisenow_df = get_dataframe('raiseNow_import', RAISENOW_COLS)
-
-    # Normalize stripe
-    stripe_df = stripe_df.query("Type == 'Charge'")
-    stripe_df['norm_date'] = pd.to_datetime(stripe_df['Created'], format='%Y-%m-%d %H:%M')
-    stripe_df['norm_amount'] = stripe_df['Amount'].astype(str).str.replace(',', '.')
-    stripe_df['norm_amount'] = stripe_df['norm_amount'].astype(float)
-    stripe_df['norm_email'] = stripe_df['Customer Email'].astype(str)
-    stripe_df['norm_name'] = stripe_df.apply(
-        lambda x: x['Customer Name'] if x.get('Customer Name') else x['Details'],
+    # --- load & normalize Stripe ---
+    stripe = (
+        get_dataframe('stripe_import', STRIPE_COLS)
+        .query("Type == 'Charge'")
+        .copy()
+    )
+    stripe['idx_stripe'] = stripe.index
+    stripe['norm_date'] = pd.to_datetime(stripe['Created'], format='%Y-%m-%d %H:%M')
+    stripe['norm_amount'] = stripe['Amount'].astype(str).str.replace(',', '.').astype(float)
+    stripe['norm_email'] = stripe['Customer Email'].astype(str)
+    stripe['norm_name'] = stripe.apply(
+        lambda r: r['Customer Name'] or r['Details'],
         axis=1
+    )
+
+    # --- load & normalize RaiseNow ---
+    raisenow = (
+        get_dataframe('raiseNow_import', RAISENOW_COLS)
+        .query("Zahlungsmethode != 'paypal'")
+        .query("Status == 'succeeded'")
+        .copy()
+    )
+
+    raisenow['idx_raisenow'] = raisenow.index
+    raisenow['norm_date'] = pd.to_datetime(raisenow['Erstellt'], format='%Y-%m-%d %H:%M')
+    raisenow['norm_amount'] = raisenow['Betrag'].astype(float)
+    raisenow['norm_email'] = raisenow['E-Mail-Adresse'].astype(str)
+    raisenow['norm_name'] = raisenow['Vorname'].astype(str) + ' ' + raisenow['Nachname'].astype(str)
+
+    # norm_zweck, first pass: prefer the Altruja action name, else the custom1 code
+    raisenow['norm_zweck'] = raisenow.apply(
+        lambda r: r.get('custom_parameters.altruja_action_name')
+        or r.get('custom_parameters.altruja_custom1_code'),
+        axis=1
     )
-
-    # Normalize raisenow
-    raisenow_df = raisenow_df.query("Zahlungsmethode != 'paypal'")
-    raisenow_df = raisenow_df.query("Status == 'succeeded'")
-    raisenow_df['norm_date'] = pd.to_datetime(raisenow_df['Erstellt'], format='%Y-%m-%d %H:%M')
-    raisenow_df['norm_amount'] = raisenow_df['Betrag'].astype(float)
-    raisenow_df['norm_name'] = (
-        raisenow_df['Vorname'].astype(str) + ' ' + raisenow_df['Nachname'].astype(str)
-    )
-    raisenow_df['norm_email'] = raisenow_df['E-Mail-Adresse'].astype(str)
-    raisenow_df['norm_zweck'] = raisenow_df.apply(
-        lambda x: x['custom_parameters.altruja_action_name']
-        if x.get('custom_parameters.altruja_action_name')
-        else x.get('custom_parameters.altruja_custom1_code'),
-        axis=1
+    # second pass: where norm_zweck is still empty/NaN, fall back to the slug from the product source URL
+    mask = raisenow['norm_zweck'].isna() | (raisenow['norm_zweck'] == '')
+    raisenow.loc[mask, 'norm_zweck'] = (
+        raisenow.loc[mask, 'raisenow_parameters.product.source_url']
+        .str.extract(r'https?://[^/]+/([^/?#]+)')[0]
     )
 
-    if table_name in ('stripe_import', 'raiseNow_import'):
-        df = stripe_df if table_name == 'stripe_import' else raisenow_df
-        return df.dropna(axis=1, how='all')
+    # --- return raw tables if requested ---
+    if table_name == 'stripe_import':
+        return stripe.dropna(axis=1, how='all')
+    if table_name == 'raiseNow_import':
+        return raisenow.dropna(axis=1, how='all')
 
-    # Exact merge
-    exact = pd.merge(
-        stripe_df,
-        raisenow_df,
-        on=['norm_amount', 'norm_name'],
-        how='outer',
-        suffixes=('_stripe', '_raisenow'),
-        indicator=True
-    )
-    exact['date_diff'] = (
-        exact['norm_date_stripe'].dt.date - exact['norm_date_raisenow'].dt.date
-    ).abs()
+    # --- 1) Greedy exact same-day matches ---
+    pairs = []
+    # index RaiseNow rows for fast lookup + dropping
+    rr = raisenow.set_index('idx_raisenow')
+    for _, s in stripe.iterrows():
+        # filter candidates by amount & name
+        cand = rr[
+            (rr['norm_amount'] == s['norm_amount']) &
+            (rr['norm_name'] == s['norm_name'])
+        ].copy()
+        if cand.empty:
+            continue
+        # compute absolute date difference (days only)
+        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
+        exact_cand = cand[date_diff == pd.Timedelta(0)]
+        if not exact_cand.empty:
+            # pick the first exact match
+            best = exact_cand.index[0]
+            pairs.append((int(s['idx_stripe']), int(best)))
+            rr = rr.drop(best)
 
-    # Separate matches
-    exact_matches = exact[(exact['_merge'] == 'both') & (exact['date_diff'] == pd.Timedelta(0))].copy()
-    stripe_only = exact[exact['_merge'] == 'left_only'].copy()
-    raisenow_only = exact[exact['_merge'] == 'right_only'].copy()
+    # --- 2) Greedy fuzzy ±1-day matches on remaining rows ---
+    used_stripe = {s for s, _ in pairs}
+    stripe_left = stripe[~stripe['idx_stripe'].isin(used_stripe)].copy()
+    for _, s in stripe_left.iterrows():
+        cand = rr[
+            (rr['norm_amount'] == s['norm_amount']) &
+            (rr['norm_name'] == s['norm_name'])
+        ].copy()
+        if cand.empty:
+            continue
+        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
+        # keep only candidates within the ±1-day window
+        date_diff = date_diff[date_diff <= pd.Timedelta(days=1)]
+        if date_diff.empty:
+            continue
+        # pick the candidate with the smallest date gap
+        best = date_diff.idxmin()
+        pairs.append((int(s['idx_stripe']), int(best)))
+        rr = rr.drop(best)
 
-    # Fuzzy merge within ±1 day for remaining
-    # Merge stripe_only with raisenow_only on name and amount
-    fuzzy = pd.merge(
-        stripe_only.drop(columns=['_merge']),
-        raisenow_only.drop(columns=['_merge']),
-        on=['norm_amount', 'norm_name'],
-        suffixes=('_stripe', '_raisenow')
-    )
-    fuzzy['date_diff'] = (
-        fuzzy['norm_date_stripe'].dt.date - fuzzy['norm_date_raisenow'].dt.date
-    ).abs()
-    fuzzy_matches = fuzzy[fuzzy['date_diff'] <= pd.Timedelta(days=1)].copy()
+    # --- build the merged DataFrame without suffixes ---
+    merged_rows = []
+    for s_idx, r_idx in pairs:
+        srow = stripe.loc[s_idx].to_dict()
+        rrow = raisenow.loc[r_idx].to_dict()
+        # drop any overlapping keys so we never get suffixes
+        for k in ['norm_amount','norm_name','norm_date','norm_email','idx_stripe']:
+            rrow.pop(k, None)
+        # combine: Stripe keeps the shared norm_* keys, RaiseNow contributes its own columns
+        merged = {**srow, **rrow}
+        merged_rows.append(merged)
 
-    # Combine exact and fuzzy
-    combined = pd.concat([exact_matches, fuzzy_matches], ignore_index=True)
-    combined = combined.drop(columns=['_merge', 'date_diff'], errors='ignore')
+    combined = pd.DataFrame(merged_rows)
 
-    # Determine outputs
+    # --- slice out the requested view ---
     if table_name == 'merged':
         result = combined
     elif table_name == 'stripe_only':
-        # Exclude those in combined
-        matched_stripe_ids = combined['_stripe'] if '_stripe' in combined else None
-        result = stripe_df[~stripe_df.index.isin(matched_stripe_ids)]
+        used = {s for s, _ in pairs}
+        result = stripe[~stripe['idx_stripe'].isin(used)]
     elif table_name == 'raisenow_only':
-        matched_raisenow_ids = combined['_raisenow'] if '_raisenow' in combined else None
-        result = raisenow_df[~raisenow_df.index.isin(matched_raisenow_ids)]
+        used = {r for _, r in pairs}
+        result = raisenow[~raisenow['idx_raisenow'].isin(used)]
     else:
         raise ValueError(f"Unknown table_name '{table_name}'")
 
@@ -148,7 +171,6 @@ def upload():
         raw = raw.dropna(how='all').dropna(axis=1, how='all')
         raw = raw.astype(object).replace({np.nan: None})
         cols = list(raw.columns)
-
         if cols[:len(STRIPE_COLS)] == STRIPE_COLS:
             key = 'stripe_import'
             dedupe_col = 'ID'
@@ -171,10 +193,8 @@ def get_table():
     table = request.args.get('table')
-    try:
-        df = get_merged_df(table)
-    except Exception as e:
-        return jsonify({'error': str(e)}), 400
+
+    df = get_merged_df(table)
     df = df.astype(object).where(pd.notnull(df), None)
 
     return jsonify({
@@ -185,13 +205,36 @@ def download():
-    sheets = { name: get_merged_df(name)
-               for name in ['stripe_import','raiseNow_import','merged','stripe_only','raisenow_only'] }
+    sheets = {
+        name: get_merged_df(name)
+        for name in [
+            'stripe_import',
+            'raiseNow_import',
+            'merged',
+            'stripe_only',
+            'raisenow_only'
+        ]
+    }
     output = BytesIO()
     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+        workbook = writer.book
         for name, df in sheets.items():
             df.to_excel(writer, sheet_name=name, index=False)
+            worksheet = writer.sheets[name]
+
+            # 1) Freeze header row
+            worksheet.freeze_panes(1, 0)
+
+            # 2) Autofilter on the header row across all columns
+            #    (0, 0) is the top-left cell; (df.shape[0], df.shape[1] - 1) covers all data rows
+            worksheet.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
+
+            # 3) Set column widths to match the first data row (skip empty sheets)
+            if not df.empty:
+                first_row = df.iloc[0].astype(str)
+                for col_idx, cell_value in enumerate(first_row):
+                    worksheet.set_column(col_idx, col_idx, len(cell_value) + 2)
     output.seek(0)
 
     return send_file(
diff --git a/app/requirements.txt b/app/requirements.txt
index 04adec8..a9c760b 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -1,4 +1,5 @@
 Flask
 flask_session
 pandas
-openpyxl
\ No newline at end of file
+openpyxl
+xlsxwriter
\ No newline at end of file
diff --git a/app/templates/index.html b/app/templates/index.html
index d82aa4c..ef1cc69 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -70,7 +70,6 @@
 
       // error handling
       if (!resp.ok) {
-        if (table) table.hideLoader();
         return alert(json.error || 'Error loading');
       }
 
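
Reviewer note, appended after the patch rather than spliced into it: the snippet below is a minimal, standalone sketch of the pairing strategy the new get_merged_df uses — an exact same-day pass first, then a ±1-day pass over the remaining rows, with every RaiseNow row consumed at most once. The toy data, the greedy_pairs helper and its signature are illustrative assumptions; only the norm_* column names and the matching rules mirror app.py.

import pandas as pd

# Toy frames that only carry the normalized columns the app builds; the values
# and index numbers are made up for this sketch.
stripe = pd.DataFrame({
    'idx_stripe':  [0, 1],
    'norm_name':   ['Ada Lovelace', 'Ada Lovelace'],
    'norm_amount': [50.0, 50.0],
    'norm_date':   pd.to_datetime(['2025-05-20 10:00', '2025-05-21 09:30']),
})
raisenow = pd.DataFrame({
    'idx_raisenow': [10, 11],
    'norm_name':    ['Ada Lovelace', 'Ada Lovelace'],
    'norm_amount':  [50.0, 50.0],
    'norm_date':    pd.to_datetime(['2025-05-21 08:00', '2025-05-22 23:00']),
})

def greedy_pairs(stripe_rows, rr, max_gap_days, pairs):
    # Pair each Stripe row with at most one unused RaiseNow row whose amount and
    # name match and whose calendar date lies within max_gap_days.
    for _, s in stripe_rows.iterrows():
        cand = rr[(rr['norm_amount'] == s['norm_amount']) &
                  (rr['norm_name'] == s['norm_name'])]
        if cand.empty:
            continue
        gap = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
        gap = gap[gap <= pd.Timedelta(days=max_gap_days)]
        if gap.empty:
            continue
        best = gap.idxmin()               # closest calendar date wins
        pairs.append((int(s['idx_stripe']), int(best)))
        rr = rr.drop(best)                # consume the RaiseNow row
    return rr

pairs = []
rr = raisenow.set_index('idx_raisenow')
rr = greedy_pairs(stripe, rr, 0, pairs)   # pass 1: exact same-day matches
left = stripe[~stripe['idx_stripe'].isin({s for s, _ in pairs})]
rr = greedy_pairs(left, rr, 1, pairs)     # pass 2: ±1-day matches on the rest

print(pairs)              # [(1, 10)] -- the 2025-05-22 row is 2 days away and stays unmatched
print(rr.index.tolist())  # [11]      -- it would land in the 'raisenow_only' view

Running the exact pass before the fuzzy pass matters here: if the ±1-day pass ran first, the 2025-05-20 charge could claim the 2025-05-21 RaiseNow row and leave the true same-day pair unmatched.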