使用AI给视频自动生成中英文字幕

Using AI to create Chinese and English Subtitles

AI
R
Python
Author

Tony D

Published

March 27, 2025

A workflow for generating and embedding bilingual subtitles for videos, covering video/audio downloading, transcription with mlx_whisper, translation with Gemini 2.0 Flash, and embedding with FFmpeg.

This document outlines a complete workflow for automatically generating and embedding bilingual (Chinese and English) subtitles for videos. It covers every step of the process, from downloading the video and audio from YouTube using yt-dlp to transcribing the audio to text with the mlx_whisper model. The guide also demonstrates how to use the Gemini 2.0 Flash model to correct and translate the transcribed text, and finally, how to embed the generated subtitles into the video using FFmpeg. This is a comprehensive resource for anyone looking to make their video content more accessible to a wider audience.

Use mlx_whisper for transcribing audio to text, and use gemini-2.0-flash for correction

Load R packages

Code
#pak::pkg_install('tuneR')
library(ellmer)
library(tidyverse)
library(srt)
library(openxlsx)
library(readxl)
library(lares)
library(tuneR)
library(stringr)

Step 1: download mp3 from youtube

Code
# Source video whose audio will be transcribed and subtitled.
youtube_url <- "https://www.youtube.com/watch?v=eZrzJtdUntg"


#title=system(paste0("yt-dlp --simulate --print '%(title)s' ",youtube_url),intern = TRUE)
#title=paste0(title,'.mp3')
#title
Code
# Build the two yt-dlp shell commands (audio-only and full video).
# shQuote() replaces the hand-written single quotes so that a URL or output
# template containing quotes or other shell metacharacters cannot break the
# command line. For the current URL the resulting command is unchanged on
# Unix-like systems.
quoted_url <- shQuote(youtube_url)

# Audio: extract to mp3 and name the output after the video title.
download_command_mp3 <- paste0(
  "yt-dlp --cookies-from-browser chrome -x --audio-format mp3 --audio-quality 0 ",
  quoted_url,
  " -o ", shQuote("%(title)s")
)

download_command_mp3

# Video: default format selection, default output name.
download_command_video <- paste0(
  "yt-dlp --cookies-from-browser chrome ",
  quoted_url
)

download_command_video
Code
# Run the audio download. system() returns the command's exit status; stop
# right away on failure because every later step needs the mp3 file.
mp3_download_status <- system(download_command_mp3)
if (mp3_download_status != 0) {
  stop(
    "yt-dlp audio download failed with exit status ", mp3_download_status,
    call. = FALSE
  )
}
Code
# Run the video download. Only the final embedding step needs the video, so a
# failure is reported as a warning instead of aborting the transcription steps.
video_download_status <- system(download_command_video)
if (video_download_status != 0) {
  warning(
    "yt-dlp video download failed with exit status ", video_download_status,
    call. = FALSE
  )
}
Code
# Show the contents of the working directory to confirm what was downloaded.
list.files()
Code
# Find the downloaded mp3 in the working directory.
# The dot is escaped and the pattern anchored to the end of the name: the
# previous ".mp3" was an unanchored regex, so it also matched names such as
# "xmp3_notes.txt" or a partially downloaded "song.mp3.part".
mp3_title <- list.files() |> str_subset(pattern = "\\.mp3$")

if (length(mp3_title) == 0) {
  stop("No .mp3 file found in ", getwd(), call. = FALSE)
}
if (length(mp3_title) > 1) {
  # Later steps expect exactly one file name; make the ambiguity visible.
  warning(
    "More than one .mp3 file found; select one before continuing: ",
    paste(mp3_title, collapse = ", "),
    call. = FALSE
  )
}
mp3_title

check mp3 duration

Code
# Read the downloaded audio file.
mp3_file <- readMP3(mp3_title)

# Number of samples in the left channel divided by the sampling rate gives the
# length in seconds; dividing by 60 converts it to minutes.
n_samples <- length(mp3_file@left)
duration_mins <- n_samples / mp3_file@samp.rate / 60
duration_mins

trim mp3 if needed

Code
# library(lares)
# trim_mp3(
#   mp3_title,
#   start_time = 1,
#   end_time = 9999999,
#   overwrite = FALSE,
#   ext = "mp3",
#   quiet = FALSE
# )


# output file:
#paste0(mp3_title |> str_replace('\\.mp3$',''),"_trim.mp3")

Step 2: using the mlx_whisper model to get text from the mp3

run audio_txt.py

"""Transcribe an audio file with mlx_whisper and write text.srt and text.txt.

Usage: python3.11 audio_txt.py -n <audio file>
"""
import argparse

import mlx_whisper
from whisper.utils import get_writer

# Model repository passed to mlx_whisper.transcribe().
MODEL_REPO = "mlx-community/whisper-large-v3-turbo"


def main():
    """Parse the command line, transcribe the file, and write both outputs."""
    # The option keeps its original -n/--name spelling so existing callers
    # keep working; only the leftover template help text is corrected.
    parser = argparse.ArgumentParser(
        description="Transcribe an audio file to text.srt and text.txt with mlx_whisper"
    )
    parser.add_argument('-n', '--name', type=str, help='Path to the audio file to transcribe', required=True)
    args = parser.parse_args()

    speech_file = args.name
    print(f"Transcribing {speech_file} ...")

    # word_timestamps=True requests per-word timing in the result.
    result = mlx_whisper.transcribe(
        speech_file,
        path_or_hf_repo=MODEL_REPO,
        word_timestamps=True,
    )

    # Write the same result twice, once per format, into the current
    # directory, using the same file names as before.
    for output_format in ("srt", "txt"):
        writer = get_writer(output_format, '.')
        writer(result, f'text.{output_format}')


if __name__ == "__main__":
    main()



Code
# Take the mp3 file name found on the R side (exposed through the `r` object).
file_name = r.mp3_title
Code
import os
import shlex
import subprocess

# Pass the arguments as a list (no shell involved) so that a file name
# containing quotes, spaces or other shell metacharacters reaches audio_txt.py
# intact. The previous string-built command broke on titles containing a
# single quote.
comand_args = ["python3.11", "audio_txt.py", "-n=" + file_name]

# Printable, shell-quoted form of the command, shown for inspection only.
comand = shlex.join(comand_args)
comand

# check=True raises CalledProcessError if the transcription script fails,
# instead of silently continuing without text.srt / text.txt.
subprocess.run(comand_args, check=True)

output is text.txt and text.srt

Step 3: Using the Gemini model to correct misrecognized words

model gemini-2.5-pro-exp-03-25 or gemini-2.0-flash

define model

Code
# Gemini chat session used to proofread the transcript.
# The API key is read from the keyring entry "google_ai_api_key".
chat_gemini_model <- chat_gemini(
  system_prompt = "你是一个中文和英文的语言学家",
  turns = NULL,
  api_key = keyring::key_get("google_ai_api_key"),
  model = "gemini-2.0-flash"
  # Optional arguments, left unset here:
  #   base_url = "https://generativelanguage.googleapis.com/v1beta"
  #   api_args = list()
  #   echo = NULL
)
chat_gemini_model
Code
# Smoke test: send a trivial message to confirm the session responds.
chat_result <- chat_gemini_model$chat("hello")
chat_result

Run model

Code
# Parse the SRT produced in Step 2 and pull the subtitle text out as a plain
# character vector (one element per cue).
srt_txt0 <- read_srt("text.srt")
srt_txt2 <- as.character(srt_txt0$subtitle)
Code
# Number of subtitle cues; the model reply should contain this many lines.
length(srt_txt2)
Code
# Ask Gemini to proofread the transcript.
# paste0() is vectorised: pasting the instruction directly onto `srt_txt2`
# produced one full copy of the instruction per subtitle line. The lines are
# therefore collapsed into a single newline-separated block first, so the
# instruction is sent once, followed by one subtitle per line.
correction_instruction <- '把以下文字是通过语言识别出来的文字。如果有错别字请更正并输出中文。保持更正后的文字与原文的文字长度一样。也保持句子总长度与更正后的句子总长度一致。比如hovah请更正为福建人。没有错则不变。有更正的句子后面加上!!!!。不要多余的反馈。输出格式为:更正前的句子《---》更正后的句子 '
prompt_text <- paste0(
  correction_instruction,
  "\n",
  paste(srt_txt2, collapse = "\n")
)
chat_result1 <- chat_gemini_model$chat(prompt_text)
Code
# Split the model reply into one element per line.
all_result2 <- unlist(strsplit(chat_result1, "\n"))

# Drop blank lines: they carry no subtitle and would shift every following
# line against the cue it belongs to.
all_result2 <- all_result2[nzchar(trimws(all_result2))]

# The next step attaches these lines to the cues one-to-one, so surface a
# mismatch here, where it is easy to diagnose.
if (length(all_result2) != length(srt_txt2)) {
  warning(
    "Model returned ", length(all_result2), " lines for ",
    length(srt_txt2), " subtitles",
    call. = FALSE
  )
}
length(all_result2)
# Manual fix-up when the reply is one line short:
#all_result2= c(all_result2,"")

add to data

Code
# Attach the model output to the subtitle table.
#   correct_txt     - text after the 《---》 delimiter, with the "!!!!" change
#                     marker removed
#   all_correct_txt - the raw reply line, kept for review
# A reply line without the delimiter yields NA from str_extract(); in that
# case fall back to the original subtitle so no cue is left as NA.
srt_txt <- srt_txt0 |>
  mutate(
    correct_txt = all_result2 |>
      str_replace("!!!!", "") |>
      str_extract("(?<=《---》).*") |>
      coalesce(as.character(subtitle)),
    all_correct_txt = all_result2
  )

Step 4: Translate to English

define model

中翻英 using google LLM model gemini-2.0-flash

Code
# Separate Gemini chat session used for Chinese-to-English translation.
# The API key is read from the keyring entry "google_ai_api_key".
chat_gemini_model_translate <- chat_gemini(
  system_prompt = "你是一个中文和英文的翻译专家",
  turns = NULL,
  api_key = keyring::key_get("google_ai_api_key"),
  model = "gemini-2.0-flash"
  # Optional arguments, left unset here:
  #   base_url = "https://generativelanguage.googleapis.com/v1beta"
  #   api_args = list()
  #   echo = NULL
)
chat_gemini_model_translate

run model

Code
# Corrected Chinese text as a plain character vector, one element per cue.
correct_txt <- as.character(srt_txt$correct_txt)
Code
# Ask Gemini to translate the corrected text into English.
# paste0() is vectorised: pasting the instruction directly onto `correct_txt`
# produced one full copy of the instruction per subtitle line. Collapse the
# lines into a single newline-separated block so the instruction is sent once
# and the model sees the whole transcript as context.
translate_instruction <- '请联系上下文把以下文字翻译成英文。总句子数量不变。不要多余的反馈。输出格式为:原来的文字《---》翻译成英文'
prompt_text <- paste0(
  translate_instruction,
  "\n",
  paste(correct_txt, collapse = "\n")
)
chat_result1 <- chat_gemini_model_translate$chat(prompt_text)
Code
# Split the translation reply into one element per line.
all_result2 <- unlist(strsplit(chat_result1, "\n"))

# Drop blank lines: they carry no translation and would shift every following
# line against the cue it belongs to.
all_result2 <- all_result2[nzchar(trimws(all_result2))]

# The next step attaches these lines to the cues one-to-one, so surface a
# mismatch here, where it is easy to diagnose.
if (length(all_result2) != length(correct_txt)) {
  warning(
    "Model returned ", length(all_result2), " lines for ",
    length(correct_txt), " subtitles",
    call. = FALSE
  )
}
length(all_result2)
# Manual fix-up when the reply has extra trailing lines:
#all_result2=all_result2[1:422]

add to data

Code
# Attach the English translation: the text after the 《---》 delimiter.
# A reply line without the delimiter yields NA from str_extract(); fall back
# to the corrected Chinese text so that no cue is left as NA.
srt_txt <- srt_txt |>
  mutate(
    correct_english_txt = all_result2 |>
      str_extract("(?<=《---》).*") |>
      coalesce(as.character(correct_txt))
  )

Step 5: output Chinese and English subtitles to srt

Code
# Build one subtitle table per language. Each keeps the cue number and
# timings and exposes the chosen text under the `subtitle` column name.
cn_correct_srt_txt <- select(srt_txt, n, start, end, subtitle = correct_txt)
en_correct_srt_txt <- select(
  srt_txt,
  n, start, end,
  subtitle = correct_english_txt
)

# Write both tracks without re-wrapping the text.
srt::write_srt(cn_correct_srt_txt, "corrected_cn_srt2.srt", wrap = FALSE)
srt::write_srt(en_correct_srt_txt, "corrected_en_srt2.srt", wrap = FALSE)

output srt_txt

Code
# Save the full working table (original, corrected and translated text) for review.
write.xlsx(srt_txt, file = "srt_data.xlsx")

Step 6: embed the srt subtitles into the mp4 using FFmpeg

Code
# Burn the corrected Chinese subtitles into the video with FFmpeg.
# NOTE(review): "input.mp4" looks like a placeholder; presumably it should be
# the video file downloaded in Step 1. Confirm the name before running.
input_video <- "input.mp4"
subtitle_file <- "corrected_cn_srt2.srt"
output_video <- "output.mp4"

# Fail early with a clear message instead of a long ffmpeg error dump.
missing_inputs <- c(input_video, subtitle_file)
missing_inputs <- missing_inputs[!file.exists(missing_inputs)]
if (length(missing_inputs) > 0) {
  stop(
    "Missing input file(s) for ffmpeg: ",
    paste(missing_inputs, collapse = ", "),
    call. = FALSE
  )
}

# The subtitles filter draws the text onto the frames, so the video stream is
# re-encoded (libx264) while the audio stream is copied unchanged.
ffmpeg_command <- paste0(
  "ffmpeg -i \"", input_video, "\"",
  " -vf \"subtitles=", subtitle_file, ":force_style='Fontsize=20'\"",
  " -c:a copy -c:v libx264 -crf 23 -preset veryfast \"", output_video, "\""
)

# system() returns the exit status; a non-zero value means ffmpeg failed and
# output_video should not be trusted.
ffmpeg_status <- system(ffmpeg_command)
if (ffmpeg_status != 0) {
  stop("ffmpeg failed with exit status ", ffmpeg_status, call. = FALSE)
}