cdh-merger/app/app.py

from flask import Flask, request, jsonify, render_template, send_file, session
import pandas as pd
import numpy as np
from io import BytesIO
from flask_session import Session
app = Flask(__name__)
app.secret_key = "your-secret-key" # replace with a secure random key
# Configure server-side session (filesystem) to avoid size limits in cookies
app.config['SESSION_TYPE'] = 'filesystem'
app.config['SESSION_FILE_DIR'] = './.flask_session/'
Session(app)
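
# Expected header columns of the two exports; /upload compares an uploaded file's
# leading columns against these lists to decide whether it is a Stripe or a RaiseNow export.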
STRIPE_COLS = [
    'Type', 'ID', 'Created', 'Description', 'Amount', 'Currency',
    'Converted Amount', 'Fees', 'Net', 'Converted Currency',
    'Customer Name', 'Customer Email', 'Details'
]
RAISENOW_COLS = [
    'Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status',
    'Betrag', 'Währung', 'Übernommene Gebühren - Betrag',
    'Übernommene Gebühren - Währung', 'Zahlungsmethode',
    'Zahlungsanbieter', 'Vorname', 'Nachname', 'E-Mail-Adresse',
    'custom_parameters.altruja_action_name', 'custom_parameters.altruja_custom1_code'
]

def get_dataframe(key, cols):
    """
    Load a DataFrame from session or create an empty one with the given columns.
    """
    records = session.get(key, [])
    if records:
        df = pd.DataFrame(records)
    else:
        df = pd.DataFrame(columns=cols)
    return df
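
# Usage note: /upload stores each table in the session as a list of row dicts, e.g.
# rows like {'Type': 'Charge', 'ID': 'ch_...', ...} (hypothetical values); get_dataframe
# rebuilds a DataFrame from them, or returns an empty frame with the given columns
# when nothing has been uploaded yet.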

def get_merged_df(table_name):
    """
    Return a DataFrame for the given table_name, built from the Stripe and RaiseNow
    imports, including a secondary match with a date tolerance of ±1 day.
    """
    stripe_df = get_dataframe('stripe_import', STRIPE_COLS)
    raisenow_df = get_dataframe('raiseNow_import', RAISENOW_COLS)
    # Normalize Stripe
    stripe_df = stripe_df.query("Type == 'Charge'")
    stripe_df['norm_date'] = pd.to_datetime(stripe_df['Created'], format='%Y-%m-%d %H:%M')
    stripe_df['norm_amount'] = stripe_df['Amount'].astype(str).str.replace(',', '.')
    stripe_df['norm_amount'] = stripe_df['norm_amount'].astype(float)
    stripe_df['norm_email'] = stripe_df['Customer Email'].astype(str)
    stripe_df['norm_name'] = stripe_df.apply(
        lambda x: x['Customer Name'] if x.get('Customer Name') else x['Details'],
        axis=1
    )
    # Normalize RaiseNow
    raisenow_df = raisenow_df.query("Zahlungsmethode != 'paypal'")
    raisenow_df = raisenow_df.query("Status == 'succeeded'")
    raisenow_df['norm_date'] = pd.to_datetime(raisenow_df['Erstellt'], format='%Y-%m-%d %H:%M')
    raisenow_df['norm_amount'] = raisenow_df['Betrag'].astype(float)
    raisenow_df['norm_name'] = (
        raisenow_df['Vorname'].astype(str) + ' ' + raisenow_df['Nachname'].astype(str)
    )
    raisenow_df['norm_email'] = raisenow_df['E-Mail-Adresse'].astype(str)
    raisenow_df['norm_zweck'] = raisenow_df.apply(
        lambda x: x['custom_parameters.altruja_action_name']
        if x.get('custom_parameters.altruja_action_name')
        else x.get('custom_parameters.altruja_custom1_code'),
        axis=1
    )
    if table_name in ('stripe_import', 'raiseNow_import'):
        df = stripe_df if table_name == 'stripe_import' else raisenow_df
        return df.dropna(axis=1, how='all')
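
    # Match the two sources on amount and donor name: exact matches must fall on the
    # same day, fuzzy matches may differ by at most one day.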
    # Exact merge on amount and donor name
    exact = pd.merge(
        stripe_df,
        raisenow_df,
        on=['norm_amount', 'norm_name'],
        how='outer',
        suffixes=('_stripe', '_raisenow'),
        indicator=True
    )
    # Day-level difference between the two booking timestamps
    exact['date_diff'] = (
        exact['norm_date_stripe'].dt.normalize() - exact['norm_date_raisenow'].dt.normalize()
    ).abs()
    # Exact matches: same amount/name and same calendar day
    exact_matches = exact[
        (exact['_merge'] == 'both') & (exact['date_diff'] == pd.Timedelta(0))
    ].copy()
    # Fuzzy matches: same amount/name, but the dates differ by at most one day
    fuzzy_matches = exact[
        (exact['_merge'] == 'both')
        & (exact['date_diff'] > pd.Timedelta(0))
        & (exact['date_diff'] <= pd.Timedelta(days=1))
    ].copy()
    # Combine exact and fuzzy matches
    combined = pd.concat([exact_matches, fuzzy_matches], ignore_index=True)
    combined = combined.drop(columns=['_merge', 'date_diff'], errors='ignore')
    # Determine outputs
    if table_name == 'merged':
        result = combined
    elif table_name == 'stripe_only':
        # Exclude Stripe charges whose ID appears in the matched rows
        matched_stripe_ids = combined['ID'] if 'ID' in combined else pd.Series(dtype=object)
        result = stripe_df[~stripe_df['ID'].isin(matched_stripe_ids)]
    elif table_name == 'raisenow_only':
        # Exclude RaiseNow payments whose Identifikationsnummer appears in the matched rows
        matched_raisenow_ids = (
            combined['Identifikationsnummer']
            if 'Identifikationsnummer' in combined
            else pd.Series(dtype=object)
        )
        result = raisenow_df[~raisenow_df['Identifikationsnummer'].isin(matched_raisenow_ids)]
    else:
        raise ValueError(f"Unknown table_name '{table_name}'")
    return result.dropna(axis=1, how='all')
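
# Valid table_name values: 'stripe_import', 'raiseNow_import', 'merged',
# 'stripe_only' and 'raisenow_only' (the same names used for the /download sheets).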

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/upload', methods=['POST'])
def upload():
    files = request.files.getlist('files')
    if not files:
        return jsonify({'error': 'No files uploaded'}), 400
    for f in files:
        raw = (
            pd.read_csv(f) if f.filename.lower().endswith('.csv') else pd.read_excel(f)
        )
        raw = raw.dropna(how='all').dropna(axis=1, how='all')
        raw = raw.astype(object).replace({np.nan: None})
        # Detect the source by its leading header columns
        cols = list(raw.columns)
        if cols[:len(STRIPE_COLS)] == STRIPE_COLS:
            key = 'stripe_import'
            dedupe_col = 'ID'
        elif cols[:len(RAISENOW_COLS)] == RAISENOW_COLS:
            key = 'raiseNow_import'
            dedupe_col = 'Identifikationsnummer'
        else:
            # Unknown layout: skip the file
            continue
        existing = get_dataframe(key, [])
        combined = pd.concat([existing, raw], ignore_index=True)
        deduped = combined.drop_duplicates(subset=[dedupe_col], keep='first').reset_index(drop=True)
        # Save back to session
        session[key] = deduped.astype(object).where(pd.notnull(deduped), None).to_dict(orient='records')
    return jsonify({'status': 'ok'})
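
# Example request (assuming the app runs locally on Flask's default port; file names are
# illustrative):
#   curl -F "files=@stripe_export.csv" -F "files=@raisenow_export.xlsx" http://localhost:5000/upload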

@app.route('/get_table')
def get_table():
    table = request.args.get('table')
    try:
        df = get_merged_df(table)
    except Exception as e:
        return jsonify({'error': str(e)}), 400
    df = df.astype(object).where(pd.notnull(df), None)
    return jsonify({
        'columns': list(df.columns),
        'data': df.to_dict(orient='records')
    })
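
# Example: GET /get_table?table=merged returns {"columns": [...], "data": [...]} as JSON;
# an unknown table name yields a 400 response with the error message.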

@app.route('/download')
def download():
    sheets = {
        name: get_merged_df(name)
        for name in ['stripe_import', 'raiseNow_import', 'merged', 'stripe_only', 'raisenow_only']
    }
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        for name, df in sheets.items():
            df.to_excel(writer, sheet_name=name, index=False)
    output.seek(0)
    return send_file(
        output,
        as_attachment=True,
        download_name='all_tables.xlsx',
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
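
# /download rebuilds every table via get_merged_df and streams them as a single workbook,
# one sheet per table.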

if __name__ == '__main__':
    app.run(debug=True)