cdh-merger/app/app.py
from flask import Flask, request, jsonify, render_template, send_file, session
import pandas as pd
import numpy as np
from io import BytesIO
from flask_session import Session
from datetime import datetime

app = Flask(__name__)
app.secret_key = "gfbierpf934hftrntr45otgß45890tfh34gft45rw"  # replace with a secure random key
app.config['SESSION_TYPE'] = 'filesystem'
app.config['SESSION_FILE_DIR'] = './.flask_session/'
Session(app)

STRIPE_STARTING_COLS = ['Type', 'ID', 'Created', 'Description', 'Amount', 'Currency', 'Converted Amount', 'Fees', 'Net', 'Converted Currency', 'Details']
RAISENOW_STARTING_COLS = ['Identifikationsnummer', 'Erstellt', 'UTC-Offset', 'Status', 'Betrag', 'Währung', 'Übernommene Gebühren - Betrag', 'Übernommene Gebühren - Währung', 'Zahlungsmethode', 'Zahlungsanbieter', 'Nettobetrag', 'Auszahlungswährung']
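
# Uploaded files are classified in /upload by comparing their leading columns
# against these header prefixes; anything else is silently skipped.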


def get_dataframe(key):
    """
    Load a DataFrame from session.
    """
    records = session.get(key, [])
    if records:
        df = pd.DataFrame(records)
    else:
        df = pd.DataFrame()
    return df


def get_merged_df(table_name):
    """
    Return a DataFrame for the given table_name based on Stripe and Raisenow inputs,
    enforcing strict one-to-one matching with:
      - exact same-day matches first
      - then ±1-day fuzzy matches
      - no pandas merge suffixes at all
      - all original columns (including Raisenow's norm_zweck) preserved
    """
# --- load & normalize Stripe ---
stripe_import = get_dataframe('stripe_import')
if stripe_import.empty:
return pd.DataFrame()
stripe_charge = (
stripe_import
.query("Type == 'Charge'")
.copy()
)
stripe_adjustment = (
stripe_import
.query("Type == 'Adjustment'")
.copy()
)
stripe_refund = (
stripe_import
.query("Type == 'Payment Failure Refund'")
.copy()
)
stripe_stripeFee = (
stripe_import
.query("Type == 'Stripe Fee'")
.copy()
)
# sum up the fees
total_stripe_charge_fees = stripe_charge['Fees'].astype(str).str.replace(',', '.').astype(float).sum()
total_stripe_adjustment_fees = stripe_adjustment['Fees'].astype(str).str.replace(',', '.').astype(float).sum()
total_stripe_refund_fees = stripe_refund['Fees'].astype(str).str.replace(',', '.').astype(float).sum()
total_stripe_stripeFee_fees = stripe_stripeFee['Fees'].astype(str).str.replace(',', '.').astype(float).sum()
stripe_adjustment['norm_date'] = pd.to_datetime(stripe_adjustment['Created'], format='%Y-%m-%d %H:%M')
stripe_adjustment['norm_amount'] = stripe_adjustment['Amount'].astype(str).str.replace(',', '.').astype(float)
stripe_adjustment['norm_zweck'] = "Korrekturen"
stripe_stripeFee['norm_date'] = pd.to_datetime(stripe_stripeFee['Created'], format='%Y-%m-%d %H:%M')
stripe_stripeFee['norm_amount'] = stripe_stripeFee['Amount'].astype(str).str.replace(',', '.').astype(float)
stripe_stripeFee['norm_zweck'] = "Stripe"
# Extract the “py_…” token from stripe_refund description
stripe_refund['norm_payment_id'] = stripe_refund['Description'].str.extract(r'(py_[A-Za-z0-9]+)')
# Build a list of all extracted py_ IDs
pyids = stripe_refund['norm_payment_id'].dropna().unique().tolist()
# Remove from stripe_charge any row whose ID is in that list
stripe_charge = stripe_charge[~stripe_charge['ID'].isin(pyids)]
stripe_charge['idx_stripe'] = stripe_charge.index
stripe_charge['norm_date'] = pd.to_datetime(stripe_charge['Created'], format='%Y-%m-%d %H:%M')
stripe_charge['norm_amount'] = stripe_charge['Amount'].astype(str).str.replace(',', '.').astype(float)
stripe_charge['norm_email'] = stripe_charge['Customer Email'].fillna('').astype(str)
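    # note: /upload stores missing values as None (not NaN), so the `or`
    # fallback below reaches Details; a real float NaN is truthy and would
    # not fall through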
    stripe_charge['norm_name'] = stripe_charge.apply(
        lambda r: r['Customer Name'] or r['Details'], axis=1
    )
# --- load & normalize Raisenow ---
raisenow_import = get_dataframe('raisenow_import')
raisenow = (
raisenow_import
.query("Zahlungsmethode != 'paypal'")
.query("Status == 'succeeded'")
.copy()
)
raisenow['idx_raisenow'] = raisenow.index
raisenow['norm_date'] = pd.to_datetime(raisenow['Erstellt'], format='%Y-%m-%d %H:%M')
raisenow['norm_amount'] = raisenow['Betrag'].astype(float)
raisenow['norm_email'] = raisenow['E-Mail-Adresse'].astype(str)
raisenow['norm_name'] = raisenow['Vorname'].astype(str) + ' ' + raisenow['Nachname'].astype(str)
# start with twostep assignment
raisenow['norm_zweck'] = raisenow.apply(
lambda r: r.get('custom_parameters.altruja_action_name')
or r.get('custom_parameters.altruja_custom1_code'),
axis=1
)
# additional assignment: build a mask of rows where norm_zweck is still empty/NaN
mask = raisenow['norm_zweck'].isna() | (raisenow['norm_zweck'] == '')
raisenow.loc[mask, 'norm_zweck'] = (
raisenow.loc[mask, 'raisenow_parameters.product.source_url']
.str.extract(r'https?://[^/]+/([^/?#]+)')[0]
)
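    # e.g. a (hypothetical) source URL "https://spenden.example.org/winterhilfe?ref=x"
    # yields its first path segment, "winterhilfe", as the purpose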
    # --- return raw tables if requested ---
    if table_name == 'stripe_import':
        return stripe_import.dropna(axis=1, how='all')
    if table_name == 'raisenow_import':
        return raisenow_import.dropna(axis=1, how='all')
    # --- 1) Greedy exact same-day matches ---
    pairs = []
    # index Raisenow rows for fast lookup + dropping
    rr = raisenow.set_index('idx_raisenow')
    for _, s in stripe_charge.iterrows():
        # filter candidates by amount & matching name or email
        cand = rr[
            (rr['norm_amount'] == s['norm_amount']) &
            (
                (rr['norm_name'] == s['norm_name']) |
                (rr['norm_email'] == s['norm_email'])
            )
        ].copy()
        if cand.empty:
            continue
        # compute absolute date difference (days only)
        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
        exact_cand = cand[date_diff == pd.Timedelta(0)]
        if not exact_cand.empty:
            # pick the first exact match
            best = exact_cand.index[0]
            pairs.append((int(s['idx_stripe']), int(best)))
            rr = rr.drop(best)
    # --- 2) Greedy fuzzy ±1-day matches on remaining rows ---
    used_stripe = {s for s, _ in pairs}
    stripe_left = stripe_charge[~stripe_charge['idx_stripe'].isin(used_stripe)].copy()
    for _, s in stripe_left.iterrows():
        cand = rr[
            (rr['norm_amount'] == s['norm_amount']) &
            (
                (rr['norm_name'] == s['norm_name']) |
                (rr['norm_email'] == s['norm_email'])
            )
        ].copy()
        if cand.empty:
            continue
        date_diff = (cand['norm_date'].dt.normalize() - s['norm_date'].normalize()).abs()
        cand = cand[date_diff <= pd.Timedelta(days=1)]
        if cand.empty:
            continue
        # pick the one with the smallest gap
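        # (date_diff was computed on the unfiltered candidates, but if cand is
        # non-empty its minimum is necessarily <= 1 day, so idxmin() stays
        # inside the window)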
        best = date_diff.idxmin()
        pairs.append((int(s['idx_stripe']), int(best)))
        rr = rr.drop(best)
    # --- 3) Unique amount & exact-date-only matches ---
    # Recompute “leftovers” now after steps 1+2
    used_stripe = {s for s, _ in pairs}
    stripe_left = stripe_charge[~stripe_charge['idx_stripe'].isin(used_stripe)].copy()
    # Prep for grouping
    stripe_left['norm_date_norm'] = stripe_left['norm_date'].dt.normalize()
    rr_df = rr.reset_index()
    rr_df['norm_date_norm'] = rr_df['norm_date'].dt.normalize()
    # Count how many per (amount, date) in each
    stripe_counts = (
        stripe_left
        .groupby(['norm_amount', 'norm_date_norm'])
        .size()
        .reset_index(name='stripe_count')
    )
    rr_counts = (
        rr_df
        .groupby(['norm_amount', 'norm_date_norm'])
        .size()
        .reset_index(name='rr_count')
    )
    # Find the pairs where both counts == 1
    unique_keys = pd.merge(stripe_counts, rr_counts,
                           on=['norm_amount', 'norm_date_norm'])
    unique_keys = unique_keys[
        (unique_keys['stripe_count'] == 1) &
        (unique_keys['rr_count'] == 1)
    ]
    # Pull those exact singletons through
    for _, u in unique_keys.iterrows():
        amt = u['norm_amount']
        d = u['norm_date_norm']
        srow = stripe_left[
            (stripe_left['norm_amount'] == amt) &
            (stripe_left['norm_date_norm'] == d)
        ].iloc[0]
        rrow = rr_df[
            (rr_df['norm_amount'] == amt) &
            (rr_df['norm_date_norm'] == d)
        ].iloc[0]
        pairs.append((int(srow['idx_stripe']), int(rrow['idx_raisenow'])))
        rr = rr.drop(rrow['idx_raisenow'])
    # --- build the merged DataFrame ---
    merged_rows = []
    for s_idx, r_idx in pairs:
        srow = stripe_charge.loc[s_idx].to_dict()
        rrow = raisenow.loc[r_idx].to_dict()
        # drop any overlapping keys so we never get suffixes
        for k in ['norm_amount', 'norm_name', 'norm_date', 'norm_email', 'idx_stripe']:
            rrow.pop(k, None)
        # now combine so stripe values win for those keys, and raisenow adds its own columns
        merged = {**srow, **rrow}
        merged_rows.append(merged)
    combined = pd.DataFrame(merged_rows)
    starting_columns = ['norm_name', 'norm_date', 'norm_email', 'norm_amount', 'norm_zweck']
    if combined.empty:
        # guard: with no matches at all, keep the expected columns so the
        # reorder below cannot raise a KeyError
        combined = pd.DataFrame(columns=starting_columns)
    # reorder columns to put the most important ones first
    combined = pd.concat([
        combined[starting_columns],
        combined.drop(columns=starting_columns)
    ], axis=1)
    # --- slice out the requested view ---
    if table_name == 'merged':
        result = combined
    elif table_name == 'stripe_only':
        used = {s for s, _ in pairs}
        result = stripe_charge[~stripe_charge['idx_stripe'].isin(used)]
    elif table_name == 'raisenow_only':
        used = {r for _, r in pairs}
        result = raisenow[~raisenow['idx_raisenow'].isin(used)]
    elif table_name == 'export':
        used = {s for s, _ in pairs}
        stripe_only = stripe_charge[~stripe_charge['idx_stripe'].isin(used)]
        result = pd.concat([combined, stripe_only, stripe_adjustment, stripe_stripeFee], ignore_index=True)
        # add the Stripe fees to the end of the table
        new_rows = [
            {'norm_zweck': 'Buchungsgebühren', 'norm_amount': total_stripe_charge_fees * (-1)},
            {'norm_zweck': 'Rückbuchungsgebühren', 'norm_amount': total_stripe_refund_fees * (-1)},
            {'norm_zweck': 'Korrekturgebühren', 'norm_amount': total_stripe_adjustment_fees * (-1)},
            {'norm_zweck': 'Stripe Gebühren', 'norm_amount': total_stripe_stripeFee_fees * (-1)}
        ]
        new_rows_df = pd.DataFrame(new_rows)
        result = pd.concat([result, new_rows_df], ignore_index=True)
        # fix empty name values
        for i, row in result.iterrows():
            if pd.isna(row.get('norm_name')) and pd.notna(row.get('Vorname')) and pd.notna(row.get('Nachname')):
                result.at[i, 'norm_name'] = f"{row.get('Vorname')} {row.get('Nachname')}".strip()
        # fix empty email values
        for i, row in result.iterrows():
            if (pd.isna(row.get('norm_email')) or row.get('norm_email') == '') and pd.notna(row.get('E-Mail-Adresse')):
                result.at[i, 'norm_email'] = f"{row.get('E-Mail-Adresse')}".strip()
    else:
        raise ValueError(f"Unknown table_name '{table_name}'")
    return result.dropna(axis=1, how='all')


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/upload', methods=['POST'])
def upload():
    files = request.files.getlist('files')
    if not files:
        return jsonify({'error': 'No files uploaded'}), 400
    for f in files:
        raw = (
            pd.read_csv(f) if f.filename.lower().endswith('.csv') else pd.read_excel(f)
        )
        raw = raw.dropna(how='all').dropna(axis=1, how='all')
        raw = raw.astype(object).replace({np.nan: None})
        cols = list(raw.columns)
        if cols[:len(STRIPE_STARTING_COLS)] == STRIPE_STARTING_COLS:
            key = 'stripe_import'
            dedupe_col = 'ID'
        elif cols[:len(RAISENOW_STARTING_COLS)] == RAISENOW_STARTING_COLS:
            key = 'raisenow_import'
            dedupe_col = 'Identifikationsnummer'
        else:
            continue
        existing = get_dataframe(key)
        combined = pd.concat([existing, raw], ignore_index=True)
        deduped = combined.drop_duplicates(subset=[dedupe_col], keep='first').reset_index(drop=True)
        # Save back to session
        session[key] = deduped.astype(object).where(pd.notnull(deduped), None).to_dict(orient='records')
    return jsonify({'status': 'ok'})
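
# Example (hypothetical filenames): both exports can go up in one request,
#   curl -c cookies.txt -F "files=@stripe_export.csv" \
#        -F "files=@raisenow_export.xlsx" http://localhost:5000/upload
# and the saved session cookie is then reused for /get_table and /download.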


@app.route('/get_table')
def get_table():
    table = request.args.get('table')
    df = get_merged_df(table)
    df = df.astype(object).where(pd.notnull(df), None)
    return jsonify({
        'columns': list(df.columns),
        'data': df.to_dict(orient='records')
    })


@app.route('/download')
def download():
    sheets = {
        name: get_merged_df(name)
        for name in [
            'stripe_import',
            'raisenow_import',
            'merged',
            'stripe_only',
            'raisenow_only',
            'export'
        ]
    }
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        for name, df in sheets.items():
            df.to_excel(writer, sheet_name=name, index=False)
            worksheet = writer.sheets[name]
            if df.empty:
                # nothing to freeze, filter or size on an empty sheet
                continue
            # 1) Freeze header row
            worksheet.freeze_panes(1, 0)
            # 2) Autofilter on the header row across all columns
            #    (0,0) is the top-left cell; (len(df), len(df.columns)-1) covers all data rows
            worksheet.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
            # 3) Set column widths to match first-row entries
            first_row = df.iloc[0].astype(str)
            for col_idx, cell_value in enumerate(first_row):
                worksheet.set_column(col_idx, col_idx, len(cell_value) + 2)
    output.seek(0)
    return send_file(
        output,
        as_attachment=True,
        download_name='all_tables.xlsx',
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )


@app.route('/clear_session', methods=['POST'])
def clear_session():
    """
    Clear all session data and reset server-side stored DataFrames.
    """
    session.clear()
    return jsonify({'status': 'session cleared'})


def export_to_special_format(
    df: pd.DataFrame,
    reference: str,
    account: str,
    statement_number: int,
    opening_date: datetime,
    opening_balance: float,
    currency: str,
    closing_date: datetime = None,
    closing_balance: float = None
) -> str:
    """
    Convert a DataFrame of transactions into the special SWIFT-like file format.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns:
        - 'value_date' (datetime)
        - 'booking_date' (datetime)
        - 'dc' (str): 'C' for credit, 'D' for debit
        - 'amount' (float)
        - optional 'transaction_code' (str)
        - optional 'bank_reference' (str)
        - 'narrative' (str)
    reference : str
        Message reference for :20:
    account : str
        Account number for :25:
    statement_number : int
        Statement sequence for :28C: (will be zero-padded to 5 digits)
    opening_date : datetime
        Opening balance date
    opening_balance : float
        Opening balance amount (positive)
    currency : str
        Three-letter currency code (e.g. 'EUR')
    closing_date : datetime, optional
        Closing balance date
    closing_balance : float, optional
        Closing balance amount (positive)

    Returns
    -------
    str
        The formatted file content.
    """
    lines = []
    # Header
    lines.append(f":20:{reference}")
    lines.append(f":25:{account}")
    lines.append(f":28C:{statement_number:05d}")
    # Opening balance :60F:
    od = opening_date.strftime('%y%m%d')
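    # format with a decimal comma and no thousands separator,
    # e.g. 12048.71 -> "12,048.71" -> "12048,71" (the same trick is used for
    # the transaction amounts and the closing balance below)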
    ob = f"{opening_balance:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '')
    lines.append(f":60F:C{od}{currency}{ob}")
    # Transactions
    for _, row in df.iterrows():
        vd = row['value_date'].strftime('%y%m%d')
        bd = row['booking_date'].strftime('%m%d')
        dc = row['dc']
        amt = f"{row['amount']:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '')
        tcode = row.get('transaction_code', '')
        bref = row.get('bank_reference', '')
        lines.append(f":61:{vd}{bd}{dc}{amt}{tcode}{bref}")
        lines.append(f":86:{row['narrative']}")
    # Closing balance :62F:
    if closing_date and closing_balance is not None:
        cd = closing_date.strftime('%y%m%d')
        cb = f"{closing_balance:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '')
        lines.append(f":62F:C{cd}{currency}{cb}")
    return "\n".join(lines)


# Example usage:
# df = pd.DataFrame([...])
# content = export_to_special_format(
#     df,
#     reference='REFEXCELEXPORT',
#     account='11223344/55667788',
#     statement_number=0,
#     opening_date=datetime(2025, 3, 6),
#     opening_balance=0.00,
#     currency='EUR',
#     closing_date=datetime(2025, 3, 6),
#     closing_balance=12048.71
# )
# with open('statement.txt', 'w') as f:
#     f.write(content)
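
# A fuller sketch of a valid input row (all values below are hypothetical):
# df = pd.DataFrame([{
#     'value_date': datetime(2025, 3, 6),
#     'booking_date': datetime(2025, 3, 6),
#     'dc': 'C',
#     'amount': 50.00,
#     'transaction_code': 'NTRF',
#     'bank_reference': 'NONREF',
#     'narrative': 'Spende Anna Muster',
# }])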


if __name__ == '__main__':
    app.run(debug=True)