From fde07e3830a74cc6276369ba6088be02cf8062dc Mon Sep 17 00:00:00 2001
From: lelo <leonid.firus@outlook.com>
Date: Sun, 15 Jun 2025 22:56:11 +0200
Subject: [PATCH] Group transcript segments by language under per-language headings

---
 transcribe_all.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/transcribe_all.py b/transcribe_all.py
index 01301b5..bfea85a 100755
--- a/transcribe_all.py
+++ b/transcribe_all.py
@@ -147,7 +147,7 @@ def process_file(file_path, model, audio_input):
     # Determine which languages to transcribe
     if (detected == 'ru' and 'predigt' in file_name.lower()) or \
        (detected == 'de' and 'russisch' in file_name.lower()):
-        langs = ['ru', 'de']
+        langs = ['de', 'ru']
     elif detected == 'en':  # songs often mis-detected as English
         langs = ['de']
     elif detected in ('de', 'ru'):
@@ -156,17 +156,12 @@ def process_file(file_path, model, audio_input):
         langs = ['ru']
 
     # Collect segments for combined result
-    combined_segments = []
+    lang_collection = {}
     for lang in langs:
+        combined_segments = []
         print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
         result = transcribe_file(model, audio_input, lang)
 
-        # Insert a synthetic segment for language header
-        combined_segments.append({
-            'start': 0,
-            'text': f"\n## Transcription ({lang.upper()})",
-        })
-
         # Extend with actual segments
         if isinstance(result, dict) and 'segments' in result:
             combined_segments.extend(result['segments'])
@@ -174,6 +169,7 @@ def process_file(file_path, model, audio_input):
             # If result isn't dict-of-segments, wrap entire text
             text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
             combined_segments.append({'start': 0, 'text': text})
+        lang_collection[lang] = combined_segments
 
     # Now write out markdown using the combined segments
     file_dir = os.path.dirname(file_path)
@@ -191,12 +187,15 @@ def process_file(file_path, model, audio_input):
         ""
     ]
     previous_text = ""
-    for segment in combined_segments:
-        start = format_timestamp(segment.get('start', 0))
-        text = segment.get('text', '').strip()
-        if text and text != previous_text:
-            md_lines.append(f"`{start}` {text}")
-            previous_text = text
+    for lang, combined_segments in lang_collection.items():
+        md_lines.append(f"##### Transcription ({lang.upper()})")
+        md_lines.append("---")
+        for segment in combined_segments:
+            start = format_timestamp(segment.get('start', 0))
+            text = segment.get('text', '').strip()
+            if text and text != previous_text:
+                md_lines.append(f"`{start}` {text}")
+                previous_text = text
 
     # Join and post-process
     transcript_md = "\n".join(md_lines)