diff --git a/transcribe_all.py b/transcribe_all.py index dd01b36..01301b5 100755 --- a/transcribe_all.py +++ b/transcribe_all.py @@ -107,44 +107,7 @@ def print_speed(current_length): trans_speed = 0 print(f" | Speed: {int(trans_speed)} minutes per hour | ", end='', flush=True) - -def write_markdown(file_path, result, postfix=None): - file_dir = os.path.dirname(file_path) - txt_folder = os.path.join(file_dir, "Transkription") - os.makedirs(txt_folder, exist_ok=True) - base_name = os.path.splitext(os.path.basename(file_path))[0] - if postfix != None: - base_name = f"{base_name}_{postfix}" - output_md = os.path.join(txt_folder, base_name + ".md") - - # Prepare the markdown content. - folder_name = os.path.basename(file_dir) - md_lines = [ - f"### {folder_name}", - f"#### {os.path.basename(file_path)}", - "---", - "" - ] - - previous_text = "" - for segment in result["segments"]: - start = format_timestamp(segment["start"]) - text = segment["text"].strip() - if previous_text != text: # suppress repeating lines - md_lines.append(f"`{start}` {text}") - previous_text = text - - transcript_md = "\n".join(md_lines) - - transcript_md = apply_error_correction(transcript_md) - - transcript_md = remove_lines_with_words(transcript_md) - - with open(output_md, "w", encoding="utf-8") as f: - f.write(transcript_md) - - print_speed(result["segments"][-1]["end"]) - print(f"... done !") + def transcribe_file(model, audio_input, language): initial_prompt = ( @@ -170,33 +133,85 @@ def detect_language(model, audio): print(f"{lang_code}. ", end='', flush=True) return lang_code + def process_file(file_path, model, audio_input): + """ + Transcribe the audio file into one markdown file. + If special case (German sermon in Russian or Russian-marked file), transcribe both in Russian and German into the same file. + """ file_name = os.path.basename(file_path) - - # default values - postfix = None - language = detect_language(model, audio_input) - - if language == 'ru' and 'predigt' in file_name.lower() or language == 'de' and 'russisch' in file_name.lower(): # make two files - # first file - language="ru" - postfix = "ru" - print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True) - markdown = transcribe_file(model, audio_input, language) - write_markdown(file_path, markdown, postfix) - # second file - language="de" - postfix = "de" - elif language == 'en': # songs mostly detect as english - language="de" - elif language == 'de' or language == 'ru': # keep as detected - pass - else: # not german not english and not russian. --> russina - language="ru" - - print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True) - markdown = transcribe_file(model, audio_input, language) - write_markdown(file_path, markdown, postfix) + + # Detect spoken language + detected = detect_language(model, audio_input) + + # Determine which languages to transcribe + if (detected == 'ru' and 'predigt' in file_name.lower()) or \ + (detected == 'de' and 'russisch' in file_name.lower()): + langs = ['ru', 'de'] + elif detected == 'en': # songs often mis-detected as English + langs = ['de'] + elif detected in ('de', 'ru'): + langs = [detected] + else: + langs = ['ru'] + + # Collect segments for combined result + combined_segments = [] + for lang in langs: + print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True) + result = transcribe_file(model, audio_input, lang) + + # Insert a synthetic segment for language header + combined_segments.append({ + 'start': 0, + 'text': f"\n## Transcription ({lang.upper()})", + }) + + # Extend with actual segments + if isinstance(result, dict) and 'segments' in result: + combined_segments.extend(result['segments']) + else: + # If result isn't dict-of-segments, wrap entire text + text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result)) + combined_segments.append({'start': 0, 'text': text}) + + # Now write out markdown using the combined segments + file_dir = os.path.dirname(file_path) + txt_folder = os.path.join(file_dir, "Transkription") + os.makedirs(txt_folder, exist_ok=True) + base_name = os.path.splitext(os.path.basename(file_path))[0] + output_md = os.path.join(txt_folder, base_name + ".md") + + # Build markdown lines + folder_name = os.path.basename(file_dir) + md_lines = [ + f"### {folder_name}", + f"#### {os.path.basename(file_path)}", + "---", + "" + ] + previous_text = "" + for segment in combined_segments: + start = format_timestamp(segment.get('start', 0)) + text = segment.get('text', '').strip() + if text and text != previous_text: + md_lines.append(f"`{start}` {text}") + previous_text = text + + # Join and post-process + transcript_md = "\n".join(md_lines) + transcript_md = apply_error_correction(transcript_md) + transcript_md = remove_lines_with_words(transcript_md) + + # Write file and report + with open(output_md, "w", encoding="utf-8") as f: + f.write(transcript_md) + + if combined_segments: + end_ts = combined_segments[-1].get('end', combined_segments[-1].get('start', 0)) + print_speed(end_ts) + print("... done !") + def process_folder(root_folder): """ @@ -225,10 +240,8 @@ def process_folder(root_folder): txt_folder = os.path.join(dirpath, "Transkription") base_name = os.path.splitext(os.path.basename(file_path))[0] output_md = os.path.join(txt_folder, base_name + ".md") - output_md_de = os.path.join(txt_folder, base_name + "_de.md") - output_md_ru = os.path.join(txt_folder, base_name + "_ru.md") # skip files with existing md files - if os.path.exists(output_md) or os.path.exists(output_md_de) or os.path.exists(output_md_ru): + if os.path.exists(output_md): continue valid_files.append(file_path)