segment between languages

2025-06-15 22:56:11 +02:00 · 2025-06-15 22:56:11 +02:00 · fde07e3830
commit fde07e3830
parent a7effaec8f
1 changed file with 13 additions and 14 deletions
--- a/transcribe_all.py
+++ b/transcribe_all.py
@@ -147,7 +147,7 @@ def process_file(file_path, model, audio_input):
    # Determine which languages to transcribe
    if (detected == 'ru' and 'predigt' in file_name.lower()) or \
       (detected == 'de' and 'russisch' in file_name.lower()):
-        langs = ['ru', 'de']
+        langs = ['de', 'ru']
    elif detected == 'en':  # songs often mis-detected as English
        langs = ['de']
    elif detected in ('de', 'ru'):
@@ -156,17 +156,12 @@ def process_file(file_path, model, audio_input):
        langs = ['ru']

    # Collect segments for combined result
-    combined_segments = []
+    lang_collection = {}
    for lang in langs:
+        combined_segments = []
        print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
        result = transcribe_file(model, audio_input, lang)

-        # Insert a synthetic segment for language header
-        combined_segments.append({
-            'start': 0,
-            'text': f"\n## Transcription ({lang.upper()})",
-        })
-
        # Extend with actual segments
        if isinstance(result, dict) and 'segments' in result:
            combined_segments.extend(result['segments'])
@@ -174,6 +169,7 @@ def process_file(file_path, model, audio_input):
            # If result isn't dict-of-segments, wrap entire text
            text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
            combined_segments.append({'start': 0, 'text': text})
+        lang_collection[lang] = combined_segments

    # Now write out markdown using the combined segments
    file_dir = os.path.dirname(file_path)
@@ -191,12 +187,15 @@ def process_file(file_path, model, audio_input):
        ""
    ]
    previous_text = ""
-    for segment in combined_segments:
-        start = format_timestamp(segment.get('start', 0))
-        text = segment.get('text', '').strip()
-        if text and text != previous_text:
-            md_lines.append(f"`{start}` {text}")
-            previous_text = text
+    for lang, combined_segments in lang_collection.items():
+        md_lines.append(f"##### Transcription ({lang.upper()})")
+        md_lines.append("---")
+        for segment in combined_segments:
+            start = format_timestamp(segment.get('start', 0))
+            text = segment.get('text', '').strip()
+            if text and text != previous_text:
+                md_lines.append(f"`{start}` {text}")
+                previous_text = text

    # Join and post-process
    transcript_md = "\n".join(md_lines)