1 changed files with 66 additions and 79 deletions
--- a/transcribe_all.py
+++ b/transcribe_all.py
@ -107,7 +107,44 @@ def print_speed(current_length):
        trans_speed = 0
    print(f" | Speed: {int(trans_speed)} minutes per hour | ", end='', flush=True)
-
+    
 def write_markdown(file_path, result, postfix=None):
    """Write a transcription result as a markdown file next to the audio file.

    The output goes into a "Transkription" subfolder of the audio file's
    directory and is named after the audio file, with "_<postfix>" appended
    when a postfix is given, plus the ".md" extension.

    Args:
        file_path: Path of the transcribed audio file.
        result: Mapping with a "segments" sequence; each segment provides
            "start", "text" and "end".
        postfix: Optional suffix for the output file name (e.g. a language
            code); None means no suffix.
    """
    file_dir = os.path.dirname(file_path)
    txt_folder = os.path.join(file_dir, "Transkription")
    os.makedirs(txt_folder, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if postfix is not None:
        base_name = f"{base_name}_{postfix}"
    output_md = os.path.join(txt_folder, base_name + ".md")
    # Header: containing folder name, file name, horizontal rule, blank line.
    folder_name = os.path.basename(file_dir)
    md_lines = [
        f"### {folder_name}",
        f"#### {os.path.basename(file_path)}",
        "---",
        "",
    ]
    segments = result["segments"]
    previous_text = ""
    for segment in segments:
        start = format_timestamp(segment["start"])
        text = segment["text"].strip()
        if previous_text != text:  # suppress consecutive repeated lines
            md_lines.append(f"`{start}` {text}")
        previous_text = text
    transcript_md = "\n".join(md_lines)
    transcript_md = apply_error_correction(transcript_md)
    transcript_md = remove_lines_with_words(transcript_md)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(transcript_md)
    # An empty transcription has no last segment; skip the speed report
    # instead of raising IndexError after the file was already written.
    if segments:
        print_speed(segments[-1]["end"])
    print("... done !")
 def transcribe_file(model, audio_input, language):
    initial_prompt = (
@ -133,85 +170,33 @@ def detect_language(model, audio):
    print(f"{lang_code}. ", end='', flush=True)
    return lang_code
 def process_file(file_path, model, audio_input):
    """
    Transcribe the audio file and write the result as markdown.
    Special case (Russian detected and 'predigt' in the file name, or German detected and 'russisch' in the file name): transcribe twice, in Russian and in German, into two separate files with the postfixes 'ru' and 'de'.
    """
    file_name = os.path.basename(file_path)
-
+    
-    # Detect spoken language
+    # default values
-    detected = detect_language(model, audio_input)
+    postfix = None
-
+    language = detect_language(model, audio_input)
-    # Determine which languages to transcribe
+        
-    if (detected == 'ru' and 'predigt' in file_name.lower()) or \
+    if language == 'ru' and 'predigt' in file_name.lower() or language == 'de' and 'russisch' in file_name.lower(): # make two files
-       (detected == 'de' and 'russisch' in file_name.lower()):
+        # first file
-        langs = ['ru', 'de']
+        language="ru"
-    elif detected == 'en':  # songs often mis-detected as English
+        postfix = "ru"
-        langs = ['de']
+        print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)
-    elif detected in ('de', 'ru'):
+        markdown = transcribe_file(model, audio_input, language)
-        langs = [detected]
+        write_markdown(file_path, markdown, postfix)
-    else:
+        # second file
-        langs = ['ru']
+        language="de"
-
+        postfix = "de"
-    # Collect segments for combined result
+    elif language == 'en': # songs mostly detect as english
-    combined_segments = []
+        language="de"
-    for lang in langs:
+    elif language == 'de' or language == 'ru': # keep as detected
-        print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
+        pass
-        result = transcribe_file(model, audio_input, lang)
+    else: # neither German, English nor Russian --> fall back to Russian
-
+        language="ru"
-        # Insert a synthetic segment for language header
+            
-        combined_segments.append({
+    print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)       
-            'start': 0,
+    markdown = transcribe_file(model, audio_input, language)
-            'text': f"\n## Transcription ({lang.upper()})",
+    write_markdown(file_path, markdown, postfix)
        })
        # Extend with actual segments
        if isinstance(result, dict) and 'segments' in result:
            combined_segments.extend(result['segments'])
        else:
            # If result isn't dict-of-segments, wrap entire text
            text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
            combined_segments.append({'start': 0, 'text': text})
    # Now write out markdown using the combined segments
    file_dir = os.path.dirname(file_path)
    txt_folder = os.path.join(file_dir, "Transkription")
    os.makedirs(txt_folder, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_md = os.path.join(txt_folder, base_name + ".md")
    # Build markdown lines
    folder_name = os.path.basename(file_dir)
    md_lines = [
        f"### {folder_name}",
        f"#### {os.path.basename(file_path)}",
        "---",
        ""
    ]
    previous_text = ""
    for segment in combined_segments:
        start = format_timestamp(segment.get('start', 0))
        text = segment.get('text', '').strip()
        if text and text != previous_text:
            md_lines.append(f"`{start}` {text}")
            previous_text = text
    # Join and post-process
    transcript_md = "\n".join(md_lines)
    transcript_md = apply_error_correction(transcript_md)
    transcript_md = remove_lines_with_words(transcript_md)
    # Write file and report
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(transcript_md)
    if combined_segments:
        end_ts = combined_segments[-1].get('end', combined_segments[-1].get('start', 0))
        print_speed(end_ts)
    print("... done !")
 def process_folder(root_folder):
    """
@ -240,8 +225,10 @@ def process_folder(root_folder):
                txt_folder = os.path.join(dirpath, "Transkription")
                base_name = os.path.splitext(os.path.basename(file_path))[0]
                output_md = os.path.join(txt_folder, base_name + ".md")
                output_md_de = os.path.join(txt_folder, base_name + "_de.md")
                output_md_ru = os.path.join(txt_folder, base_name + "_ru.md")
                # skip files with existing md files
-                if os.path.exists(output_md):
+                if os.path.exists(output_md) or os.path.exists(output_md_de) or os.path.exists(output_md_ru):
                    continue
                valid_files.append(file_path)