# cdh-merger/app/app.py

from flask import Flask, request, jsonify, render_template, send_file, session
import pandas as pd
import numpy as np
from io import BytesIO
from flask_session import Session
app = Flask(__name__)
app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw" # replace with a secure random key
app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw" # replace with a secure random key
app.config['SESSION_TYPE'] = 'filesystem'
app.config['SESSION_FILE_DIR'] = './.flask_session/'
Session(app)
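
# Server-side sessions are presumably used here because the uploaded tables
# would blow past the ~4 KiB browser cookie limit; flask-session keeps the
# session data on disk under SESSION_FILE_DIR instead.
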
STRIPE_COLS = ['Type', 'ID', 'Created', 'Description', 'Amount', 'Currency', 'Converted Amount', 'Fees', 'Net', 'Converted Currency', 'Details']
RAISENOW_COLS = ['Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status', 'Betrag', 'Währung', 'Übernommene Gebühren - Betrag', 'Übernommene Gebühren - Währung', 'Zahlungsmethode', 'Zahlungsanbieter', 'Nettobetrag', 'Auszahlungswährung']


def get_dataframe(key):
    """
    Load a DataFrame from session.
    """
    records = session.get(key, [])
    if records:
        df = pd.DataFrame(records)
    else:
        df = pd.DataFrame()
    return df
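
# Session values are stored as list-of-dict records by upload() below, so
# pd.DataFrame(records) rebuilds the uploaded table as-is.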


def get_merged_df(table_name):
    """
    Return a DataFrame for the given table_name based on Stripe and Raisenow
    inputs, enforcing strict one-to-one matching with:
      - exact same-day matches first
      - then ±1-day fuzzy matches
      - no pandas merge suffixes at all
      - all original columns (including Raisenow's norm_zweck) preserved
    """
# --- load & normalize Stripe ---
stripe = get_dataframe('stripe_import')
if not stripe.empty:
stripe = (
stripe
.query("Type == 'Charge'")
.copy()
)
else:
return stripe
stripe['idx_stripe'] = stripe.index
stripe['norm_date'] = pd.to_datetime(stripe['Created'], format='%Y-%m-%d %H:%M')
stripe['norm_amount'] = stripe['Amount'].astype(str).str.replace(',', '.').astype(float)
stripe['norm_email'] = stripe['Customer Email'].astype(str)
stripe['norm_name'] = stripe.apply(
lambda r: r['Customer Name'] or r['Details'], axis=1
)
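    # Note: 'Amount' can arrive with a comma as the decimal separator in the
    # CSV export (e.g. "12,50"), hence str.replace(',', '.') above; 'Customer
    # Name' falls back to 'Details' when it is empty or None.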
# --- load & normalize Raisenow ---
raisenow = get_dataframe('raiseNow_import')
if not raisenow.empty:
raisenow = (
raisenow
.query("Zahlungsmethode != 'paypal'")
.query("Status == 'succeeded'")
.copy()
)
else:
return raisenow
raisenow['idx_raisenow'] = raisenow.index
raisenow['norm_date'] = pd.to_datetime(raisenow['Erstellt'], format='%Y-%m-%d %H:%M')
raisenow['norm_amount'] = raisenow['Betrag'].astype(float)
raisenow['norm_email'] = raisenow['E-Mail-Adresse'].astype(str)
raisenow['norm_name'] = raisenow['Vorname'].astype(str) + ' ' + raisenow['Nachname'].astype(str)
# start with twostep assignment
raisenow['norm_zweck'] = raisenow.apply(
lambda r: r.get('custom_parameters.altruja_action_name')
or r.get('custom_parameters.altruja_custom1_code'),
axis=1
)
# additional assignment: build a mask of rows where norm_zweck is still empty/NaN
mask = raisenow['norm_zweck'].isna() | (raisenow['norm_zweck'] == '')
raisenow.loc[mask, 'norm_zweck'] = (
raisenow.loc[mask, 'raisenow_parameters.product.source_url']
.str.extract(r'https?://[^/]+/([^/?#]+)')[0]
)
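    # The regex keeps the first URL path segment as the purpose, e.g. a
    # (hypothetical) source URL 'https://spenden.example.org/kampagne-xyz?ref=1'
    # would yield 'kampagne-xyz'.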
    # --- return raw tables if requested ---
    if table_name == 'stripe_import':
        return stripe.dropna(axis=1, how='all')
    if table_name == 'raiseNow_import':
        return raisenow.dropna(axis=1, how='all')
    # --- 1) Greedy exact same-day matches ---
    pairs = []
    # index Raisenow rows for fast lookup + dropping
    rr = raisenow.set_index('idx_raisenow')
    for _, s in stripe.iterrows():
        # filter candidates by amount & name
        cand = rr[
            (rr['norm_amount'] == s['norm_amount']) &
            (rr['norm_name'] == s['norm_name'])
        ].copy()
        if cand.empty:
            continue
        # compute absolute date difference (days only)
        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
        exact_cand = cand[date_diff == pd.Timedelta(0)]
        if not exact_cand.empty:
            # pick the first exact match
            best = exact_cand.index[0]
            pairs.append((int(s['idx_stripe']), int(best)))
            rr = rr.drop(best)
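    # "Greedy" means each Stripe charge claims the first same-day candidate and
    # removes it from the pool (rr.drop), so no Raisenow row is matched twice.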
    # --- 2) Greedy fuzzy ±1-day matches on remaining rows ---
    used_stripe = {s for s, _ in pairs}
    stripe_left = stripe[~stripe['idx_stripe'].isin(used_stripe)].copy()
    for _, s in stripe_left.iterrows():
        cand = rr[
            (rr['norm_amount'] == s['norm_amount']) &
            (rr['norm_name'] == s['norm_name'])
        ].copy()
        if cand.empty:
            continue
        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
        cand = cand[date_diff <= pd.Timedelta(days=1)]
        if cand.empty:
            continue
        # pick the candidate with the smallest gap
        best = date_diff.idxmin()
        pairs.append((int(s['idx_stripe']), int(best)))
        rr = rr.drop(best)
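    # Because both timestamps are normalized to midnight before comparing,
    # "±1 day" means adjacent calendar days, not a rolling 24-hour window.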
    # --- build the merged DataFrame without suffixes ---
    merged_rows = []
    for s_idx, r_idx in pairs:
        srow = stripe.loc[s_idx].to_dict()
        rrow = raisenow.loc[r_idx].to_dict()
        # drop the overlapping normalized keys from the Raisenow side so the
        # Stripe values survive and no merge suffixes are ever needed
        for k in ['norm_amount', 'norm_name', 'norm_date', 'norm_email', 'idx_stripe']:
            rrow.pop(k, None)
        # combine: Stripe provides the base record, Raisenow adds its own columns
        merged = {**srow, **rrow}
        merged_rows.append(merged)
    combined = pd.DataFrame(merged_rows)
    # --- slice out the requested view ---
    if table_name == 'merged':
        result = combined
    elif table_name == 'stripe_only':
        used = {s for s, _ in pairs}
        result = stripe[~stripe['idx_stripe'].isin(used)]
    elif table_name == 'raisenow_only':
        used = {r for _, r in pairs}
        result = raisenow[~raisenow['idx_raisenow'].isin(used)]
    else:
        raise ValueError(f"Unknown table_name '{table_name}'")
    return result.dropna(axis=1, how='all')
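
# The five views exposed to the frontend and the Excel export:
#   stripe_import / raiseNow_import  -> the filtered, normalized uploads
#   merged                           -> one row per matched Stripe/Raisenow pair
#   stripe_only / raisenow_only     -> rows left unmatched on either side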


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/upload', methods=['POST'])
def upload():
    files = request.files.getlist('files')
    if not files:
        return jsonify({'error': 'No files uploaded'}), 400
    for f in files:
        raw = (
            pd.read_csv(f) if f.filename.lower().endswith('.csv') else pd.read_excel(f)
        )
        raw = raw.dropna(how='all').dropna(axis=1, how='all')
        raw = raw.astype(object).replace({np.nan: None})
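        # None (unlike NaN, which is truthy) is falsy, which the `or` fallbacks
        # in get_merged_df rely on when a cell is blank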
        cols = list(raw.columns)
        # classify the upload by its leading columns
        if cols[:len(STRIPE_COLS)] == STRIPE_COLS:
            key = 'stripe_import'
            dedupe_col = 'ID'
        elif cols[:len(RAISENOW_COLS)] == RAISENOW_COLS:
            key = 'raiseNow_import'
            dedupe_col = 'Identifikationsnummer'
        else:
            continue
        existing = get_dataframe(key)
        combined = pd.concat([existing, raw], ignore_index=True)
        deduped = combined.drop_duplicates(subset=[dedupe_col], keep='first').reset_index(drop=True)
        # save back to session; keep='first' means rows already stored win over re-uploads
        session[key] = deduped.astype(object).where(pd.notnull(deduped), None).to_dict(orient='records')
    return jsonify({'status': 'ok'})


@app.route('/get_table')
def get_table():
    table = request.args.get('table')
    df = get_merged_df(table)
    # NaN -> None so empty cells serialize as JSON null
    df = df.astype(object).where(pd.notnull(df), None)
    return jsonify({
        'columns': list(df.columns),
        'data': df.to_dict(orient='records')
    })


@app.route('/download')
def download():
    sheets = {
        name: get_merged_df(name)
        for name in [
            'stripe_import',
            'raiseNow_import',
            'merged',
            'stripe_only',
            'raisenow_only'
        ]
    }
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        for name, df in sheets.items():
            df.to_excel(writer, sheet_name=name, index=False)
            worksheet = writer.sheets[name]
            # 1) Freeze header row
            worksheet.freeze_panes(1, 0)
            if df.empty:
                # nothing to filter or size on an empty sheet, and df.iloc[0]
                # below would raise IndexError
                continue
            # 2) Autofilter on the header row across all columns
            #    (0, 0) is the top-left cell; (df.shape[0], df.shape[1] - 1)
            #    covers all data rows
            worksheet.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
            # 3) Set column widths to match first-row entries
            first_row = df.iloc[0].astype(str)
            for col_idx, cell_value in enumerate(first_row):
                worksheet.set_column(col_idx, col_idx, len(cell_value) + 2)
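            # Note: widths are sized from the first data row only, so longer
            # values further down the sheet may still display truncated.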
    output.seek(0)
    return send_file(
        output,
        as_attachment=True,
        download_name='all_tables.xlsx',
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )


if __name__ == '__main__':
    app.run(debug=True)