使用AI给播客语音转文字并作摘要

Using AI to transcribe a podcast and summarize it

AI
R
Python
Author

Tony Duan

Published

March 28, 2025

Generate a text summary for podcasts from platforms such as Firstory, Poddtoppen, and 小宇宙FM

Code
#pak::pkg_install('tuneR')
library(ellmer)
library(tidyverse)
library(srt)
library(openxlsx)
library(readxl)
library(lares)
library(tuneR)
library(stringr)
library(rvest)
library(av)

Step 1: download mp3

check mp3 duration

Code
# Read output.mp3 into `mp3_file`
mp3_file <- readMP3("output.mp3")

# Duration in minutes: number of samples in the `left` slot divided by the
# sampling rate, then divided by 60
duration_mins <- (
  length(slot(mp3_file, "left")) / slot(mp3_file, "samp.rate")
) / 60
print(duration_mins)

trim mp3 if needed

Code
# Trim output.mp3 from start_time = 1 to end_time = 600
# (presumably seconds - confirm against the lares::trim_mp3 documentation).
trim_mp3("output.mp3", start_time = 1, end_time = 600,
         overwrite = FALSE, ext = "mp3", quiet = FALSE)

# Output file: output_trimmed.mp3 (the name the following chunk reads)
Code
# Read the trimmed file
mp3_file <- readMP3("output_trimmed.mp3")
# Print its duration in minutes (sample count / sampling rate / 60)
print((length(slot(mp3_file, "left")) / slot(mp3_file, "samp.rate")) / 60)

check mp3 duration

Code
# Read output.mp3 into `mp3_file`
mp3_file <- readMP3("output.mp3")

# Duration in minutes: number of samples in the `left` slot divided by the
# sampling rate, then divided by 60
duration_mins <- (
  length(slot(mp3_file, "left")) / slot(mp3_file, "samp.rate")
) / 60
print(duration_mins)

trim mp3 if needed

Code
# Trimming is disabled here; uncomment the call below to enable it.
# trim_mp3(
#   'output.mp3',
#   start_time = 1,
#   end_time = 600,
#   overwrite = FALSE,
#   ext = "mp3",
#   quiet = FALSE
# )

# Output file (when enabled): output_trimmed.mp3
Code
# Read the trimmed file
mp3_file <- readMP3("output_trimmed.mp3")
# Print its duration in minutes (sample count / sampling rate / 60)
print((length(slot(mp3_file, "left")) / slot(mp3_file, "samp.rate")) / 60)

convert m4a to mp3

Code
# Convert output.m4a to output.mp3 with the av package
av_audio_convert(audio = "output.m4a", output = "output.mp3")
Code
# List the files in the current working directory
list.files(path = ".")

check mp3 duration

Code
# Read the converted output.mp3 into `mp3_file`
mp3_file <- readMP3("output.mp3")

# Duration in minutes: number of samples in the `left` slot divided by the
# sampling rate, then divided by 60
duration_mins <- (
  length(slot(mp3_file, "left")) / slot(mp3_file, "samp.rate")
) / 60
print(duration_mins)

trim mp3 if needed

Code
# Trimming is disabled here; uncomment the call below to enable it.
# trim_mp3(
#   'output.mp3',
#   start_time = 1,
#   end_time = 600,
#   overwrite = FALSE,
#   ext = "mp3",
#   quiet = FALSE
# )

# Output file (when enabled): output_trimmed.mp3
Code
# Disabled: duration check of the trimmed file.
# # Load the MP3 file
# mp3_file <- readMP3("output_trimmed.mp3" ) 
# # Get the duration in minutes (samples / sampling rate / 60)
# (length(mp3_file@left) / mp3_file@samp.rate)/60

Step 2: using mlx_whisper model to get text from audio(mp3,m4a)

Run audio_txt.py, which contains the following script:

import mlx_whisper
from whisper.utils import get_writer

# Audio file to transcribe
speech_file = "output.mp3"

# Transcribe with the mlx-community/whisper-large-v3-turbo model,
# requesting word-level timestamps
result = mlx_whisper.transcribe(
    speech_file,
    path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
    word_timestamps=True,
)

# Write the transcript to the current directory in both formats:
# text.srt and text.txt
for fmt in ("srt", "txt"):
    srt_writer = get_writer(fmt, ".")
    srt_writer(result, "text." + fmt)



Code
import os  # kept in case later chunks rely on `os` being imported
import subprocess

# Run the transcription script. check=True raises CalledProcessError when the
# script exits with a non-zero status, instead of silently ignoring the failure
# the way os.system() does.
subprocess.run(["python3.11", "audio_txt.py"], check=True)

output is text.txt and text.srt

Step 3: Use the Gemini model to summarize the transcript

model gemini-2.5-pro-exp-03-25 or gemini-2.0-flash

define model

Code
# Create the Gemini chat client used for the summary step.
# The API key is read from the keyring entry "google_ai_api_key".
# `turns = NULL` was only restating the default and is deprecated in newer
# ellmer releases (NOTE(review): confirm against the installed version); the
# dangling comma that passed an empty trailing argument is removed as well.
chat_gemini_model <- chat_gemini(
  system_prompt = "你是一个中文,英文,威士忌专家",
  # base_url = "https://generativelanguage.googleapis.com/v1beta",
  api_key = keyring::key_get("google_ai_api_key"),
  # api_args = list(),
  # echo = NULL,
  model = "gemini-2.0-flash"
)
chat_gemini_model
Code
# Smoke test: send a trivial prompt to check the model connection
chat_result <- chat_gemini_model$chat("hello")
chat_result

Run model

Code
# Parse the SRT transcript and keep its `subtitle` column as a character vector
srt_txt0 <- read_srt("text.srt")
srt_txt2 <- as.character(srt_txt0[["subtitle"]])
Code
# Number of subtitle entries read from text.srt
print(length(srt_txt2))
Code
# Build ONE prompt: the instruction followed by the whole transcript.
# paste0() is vectorised, so pasting the instruction straight onto the
# subtitle vector would repeat the instruction in front of every subtitle
# line. Collapse the subtitles into a single string first.
prompt_text <- paste0(
  "请给以下文字作500字内摘要:",
  paste(srt_txt2, collapse = "\n")
)
summary_text <- chat_gemini_model$chat(prompt_text)
Code
# Save the summary as plain UTF-8 text. writeLines() writes the text verbatim;
# write_delim() on a one-column tibble would add a header row and wrap text
# containing its default space delimiter in quotes.
summary_text |>
  as.character() |>
  enc2utf8() |>
  writeLines("summary.txt", useBytes = TRUE)

Step 4: Correct the summary

define model

Code
# Create the Gemini chat client used for the correction step.
# The API key is read from the keyring entry "google_ai_api_key".
# `turns = NULL` was only restating the default and is deprecated in newer
# ellmer releases (NOTE(review): confirm against the installed version); the
# dangling comma that passed an empty trailing argument is removed as well.
chat_gemini_model <- chat_gemini(
  system_prompt = "你是一个中文和英文的威士忌专家",
  # base_url = "https://generativelanguage.googleapis.com/v1beta",
  api_key = keyring::key_get("google_ai_api_key"),
  # api_args = list(),
  # echo = NULL,
  model = "gemini-2.0-flash"
)
chat_gemini_model

Run model

Code
# Ask the model to fix typos and apply the fixed name corrections.
# The instruction and the summary are concatenated without a separator.
prompt_text <- paste0(
  "请更正以下文字的错别字,并且改正胡云为壶云,希游记为嬉游忌,Wish Jokey为WhisJockey,不要空白行",
  summary_text
)
correct_summary_text <- chat_gemini_model$chat(prompt_text)
Code
# Collapse every run of consecutive newlines into a single newline so that no
# blank lines remain (replacing only "\n\n" leaves a blank line behind for
# runs of three or more newlines). Then write the text verbatim as UTF-8:
# write_delim() on a one-column tibble would add a header row and quoting.
correct_summary_text |>
  str_replace_all("\n{2,}", "\n") |>
  as.character() |>
  enc2utf8() |>
  writeLines("correct_summary2.txt", useBytes = TRUE)

Step 5: Summarize the SRT text

define model

Code
# Create the Gemini chat client used for the SRT summary step.
# The API key is read from the keyring entry "google_ai_api_key".
# `turns = NULL` was only restating the default and is deprecated in newer
# ellmer releases (NOTE(review): confirm against the installed version); the
# dangling comma that passed an empty trailing argument is removed as well.
chat_gemini_model <- chat_gemini(
  system_prompt = "你是一个中文和英文的威士忌专家",
  # base_url = "https://generativelanguage.googleapis.com/v1beta",
  api_key = keyring::key_get("google_ai_api_key"),
  # model = "gemini-2.5-pro-exp-03-25",
  # api_args = list(),
  # echo = NULL,
  model = "gemini-2.0-flash"
)
chat_gemini_model

Run model

Code
# Read the raw SRT file as text and join its lines into a single string.
# read.delim() would parse the subtitle file as a tab-separated table, taking
# the first line as a header, and pasting that data frame into a prompt yields
# deparsed R vectors rather than the original SRT text.
srt_txt_format <- paste(
  readLines("text.srt", encoding = "UTF-8", warn = FALSE),
  collapse = "\n"
)
Code
# Ask the model for a summary per 5 minutes of the SRT content, with the
# fixed name corrections applied
prompt_text <- paste0(
  "下面的内容是srt文档。请按每5分钟做一个摘要,再更正以下文字,胡云为壶云,希游记为嬉游忌,Wish Jokey为WhisJockey",
  srt_txt_format
)
correct_summary_text <- chat_gemini_model$chat(prompt_text)
Code
# Save the per-5-minute summary as plain UTF-8 text. writeLines() writes the
# text verbatim; write_delim() on a one-column tibble would add a header row
# and wrap text containing its default space delimiter in quotes.
correct_summary_text |>
  as.character() |>
  enc2utf8() |>
  writeLines("correct_srt_summary.txt", useBytes = TRUE)