# Video/Audio Transcription

Convert video and audio content to text using the GAIK toolkit.
## Overview
This example demonstrates how to build a transcription system that converts video and audio files into text format. The transcribed text can then be processed using other GAIK toolkit components for knowledge extraction and analysis.
## Use Case

**Scenario:** You need to make video and audio content searchable, accessible, and analyzable.

**Solution:** Build a transcription pipeline that converts media files to text and structures the content for further processing.

**Benefits:**
- Make spoken content searchable
- Generate subtitles and captions
- Enable content analysis
- Improve accessibility
- Build searchable media archives
## Implementation

### Step 1: Setup

Install the toolkit (e.g. `npm install gaik-toolkit`, matching the import specifier below), then import the components used in this example:
```javascript
import { Parser, KnowledgeCapture } from 'gaik-toolkit';

// For this example, we assume you have access to a transcription service.
// You can use a service such as OpenAI Whisper or Google Speech-to-Text.
```

### Step 2: Audio Extraction
Extract audio from video files:
```javascript
class MediaProcessor {
  async extractAudio(videoPath) {
    // Extract the audio track from the video.
    // In production this would typically shell out to ffmpeg or similar.
    console.log(`Extracting audio from ${videoPath}`);
    return {
      audioPath: videoPath.replace(/\.[^.]+$/, '.wav'),
      format: 'wav',
      duration: 0, // Read from the media metadata in a real implementation
      channels: 2,
      sampleRate: 44100
    };
  }

  async getMediaInfo(filePath) {
    // Placeholder: return static media information.
    // In production, probe the file instead of hard-coding these values.
    return {
      duration: 300, // seconds
      format: 'video/mp4',
      hasAudio: true,
      hasVideo: true
    };
  }
}
```
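If `ffmpeg` and `ffprobe` are installed on the host, the stubs above can be filled in by shelling out to them. The sketch below is one way to do that, assuming both binaries are on the `PATH`; the 16 kHz mono WAV settings are a common choice for speech-to-text input, not a requirement:

```javascript
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';

const run = promisify(execFile);

class FfmpegMediaProcessor {
  async extractAudio(videoPath) {
    const audioPath = videoPath.replace(/\.[^.]+$/, '.wav');
    // -vn drops the video stream; 16 kHz mono 16-bit PCM suits most
    // speech-to-text services.
    await run('ffmpeg', [
      '-y', '-i', videoPath,
      '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1',
      audioPath
    ]);
    return { audioPath, format: 'wav', channels: 1, sampleRate: 16000 };
  }

  async getMediaInfo(filePath) {
    // Probe the container duration; csv=p=0 strips the section prefix.
    const { stdout: dur } = await run('ffprobe', [
      '-v', 'error', '-show_entries', 'format=duration',
      '-of', 'csv=p=0', filePath
    ]);
    // List stream types to detect audio/video tracks.
    const { stdout: streams } = await run('ffprobe', [
      '-v', 'error', '-show_entries', 'stream=codec_type',
      '-of', 'csv=p=0', filePath
    ]);
    const types = streams.trim().split('\n');
    return {
      duration: Number(dur.trim()),
      hasAudio: types.includes('audio'),
      hasVideo: types.includes('video')
    };
  }
}
```

Since this class mirrors the `MediaProcessor` interface, it can be dropped into the pipeline unchanged.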
### Step 3: Transcription Engine

Create the transcription functionality:
```javascript
class TranscriptionEngine {
  constructor(apiKey) {
    this.apiKey = apiKey;
    // Initialize your transcription service here
  }

  async transcribe(audioPath, options = {}) {
    const {
      language = 'en',
      timestamps = true,
      speakers = false
    } = options;

    console.log(`Transcribing ${audioPath}...`);

    // Call the transcription service.
    // This is a placeholder - integrate with an actual service.
    const transcript = await this.callTranscriptionService(
      audioPath,
      { language, timestamps, speakers }
    );
    return transcript;
  }

  async callTranscriptionService(audioPath, options) {
    // Placeholder for the actual API call.
    // In production, use Whisper, Google Speech-to-Text, etc.
    return {
      text: 'Sample transcription text...',
      segments: [
        {
          start: 0,
          end: 5.2,
          text: 'Welcome to this video.',
          confidence: 0.98
        },
        {
          start: 5.2,
          end: 10.5,
          text: 'Today we will discuss...',
          confidence: 0.95
        }
      ],
      language: options.language,
      duration: 300
    };
  }

  async transcribeWithSpeakers(audioPath) {
    // Transcribe with speaker identification enabled
    return await this.transcribe(audioPath, { speakers: true });
  }
}
```
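As a concrete example, `callTranscriptionService` could delegate to OpenAI's Whisper API through the official `openai` npm package. This is a sketch of one possible integration, not a built-in GAIK feature. Note that Whisper's `verbose_json` segments report `avg_logprob` rather than a confidence score (the mapping below uses `exp(avg_logprob)` as a rough stand-in), and `whisper-1` does not perform speaker diarization:

```javascript
import fs from 'node:fs';
import OpenAI from 'openai';

class WhisperTranscriptionEngine extends TranscriptionEngine {
  async callTranscriptionService(audioPath, options) {
    const client = new OpenAI({ apiKey: this.apiKey });
    // verbose_json includes segment-level timestamps.
    const response = await client.audio.transcriptions.create({
      file: fs.createReadStream(audioPath),
      model: 'whisper-1',
      language: options.language,
      response_format: 'verbose_json'
    });
    return {
      text: response.text,
      segments: (response.segments ?? []).map(s => ({
        start: s.start,
        end: s.end,
        text: s.text.trim(),
        // Rough heuristic: exponentiate the average token log-probability
        // to approximate a per-segment confidence in [0, 1].
        confidence: Math.exp(s.avg_logprob)
      })),
      language: response.language,
      duration: response.duration
    };
  }
}
```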
### Step 4: Build Complete Transcription System

Combine all components:
```javascript
class VideoTranscriptionSystem {
  constructor(apiKey) {
    this.processor = new MediaProcessor();
    this.transcriber = new TranscriptionEngine(apiKey);
    this.parser = new Parser();
    this.capture = new KnowledgeCapture();
  }

  async transcribeVideo(videoPath, options = {}) {
    try {
      // Get media info
      const info = await this.processor.getMediaInfo(videoPath);

      // Extract the audio track if the file contains video
      let audioPath = videoPath;
      if (info.hasVideo) {
        const audioInfo = await this.processor.extractAudio(videoPath);
        audioPath = audioInfo.audioPath;
      }

      // Transcribe
      const transcript = await this.transcriber.transcribe(audioPath, options);

      // Process with GAIK
      const knowledge = await this.processTranscript(transcript);

      return {
        source: videoPath,
        transcript: transcript,
        knowledge: knowledge,
        metadata: info,
        processedAt: new Date().toISOString()
      };
    } catch (error) {
      console.error('Transcription failed:', error);
      throw error;
    }
  }

  async processTranscript(transcript) {
    // Use Knowledge Capture to extract entities and concepts
    const knowledge = await this.capture.captureFromText(transcript.text);
    return {
      ...knowledge,
      segments: transcript.segments,
      summary: this.generateSummary(transcript.segments)
    };
  }

  generateSummary(segments) {
    // Naive placeholder summary: join the first five segment texts
    const sentences = segments.slice(0, 5).map(s => s.text);
    return sentences.join(' ');
  }

  async transcribeBatch(filePaths, options = {}) {
    // Note: this runs all files concurrently; see the BatchProcessor
    // below for a version with a concurrency limit.
    const results = await Promise.all(
      filePaths.map(path => this.transcribeVideo(path, options))
    );
    return results;
  }
}
```

## Complete Example
Here's how to use the transcription system:
```javascript
// Initialize the system
const apiKey = 'your-transcription-api-key';
const transcription = new VideoTranscriptionSystem(apiKey);

// Transcribe a single video
async function transcribeSingleVideo() {
  const result = await transcription.transcribeVideo(
    'path/to/video.mp4',
    {
      language: 'en',
      timestamps: true,
      speakers: false
    }
  );

  console.log('Transcript:', result.transcript.text);
  console.log('Entities:', result.knowledge.entities);
  console.log('Concepts:', result.knowledge.concepts);
  return result;
}

// Transcribe multiple videos
async function transcribeMultipleVideos() {
  const videos = ['video1.mp4', 'video2.mp4', 'video3.mp4'];
  const results = await transcription.transcribeBatch(videos);

  results.forEach((result, index) => {
    console.log(`Video ${index + 1}:`);
    console.log('Duration:', result.metadata.duration);
    console.log('Summary:', result.knowledge.summary);
  });
  return results;
}

// Run
transcribeSingleVideo().catch(console.error);
```

## Advanced Features
### Subtitle Generation
Generate subtitle files from transcription:
```javascript
class SubtitleGenerator {
  generateSRT(transcript) {
    let srt = '';
    transcript.segments.forEach((segment, index) => {
      srt += `${index + 1}\n`;
      srt += `${this.formatTime(segment.start)} --> ${this.formatTime(segment.end)}\n`;
      srt += `${segment.text}\n\n`;
    });
    return srt;
  }

  // SRT timestamps use a comma before the milliseconds: HH:MM:SS,mmm
  formatTime(seconds) {
    const hours = Math.floor(seconds / 3600);
    const minutes = Math.floor((seconds % 3600) / 60);
    const secs = Math.floor(seconds % 60);
    const ms = Math.floor((seconds % 1) * 1000);
    return `${String(hours).padStart(2, '0')}:${String(minutes).padStart(2, '0')}:${String(secs).padStart(2, '0')},${String(ms).padStart(3, '0')}`;
  }

  generateVTT(transcript) {
    let vtt = 'WEBVTT\n\n';
    transcript.segments.forEach((segment, index) => {
      vtt += `${index + 1}\n`;
      vtt += `${this.formatTimeVTT(segment.start)} --> ${this.formatTimeVTT(segment.end)}\n`;
      vtt += `${segment.text}\n\n`;
    });
    return vtt;
  }

  // WebVTT uses a period instead of a comma before the milliseconds
  formatTimeVTT(seconds) {
    return this.formatTime(seconds).replace(',', '.');
  }
}

// Usage: `transcript` comes from TranscriptionEngine.transcribe()
const subtitleGen = new SubtitleGenerator();
const srt = subtitleGen.generateSRT(transcript);
const vtt = subtitleGen.generateVTT(transcript);
```
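To persist the generated subtitles, a small helper can write them next to the source video. A minimal sketch (the naming convention is an assumption):

```javascript
import { writeFile } from 'node:fs/promises';

async function saveSubtitles(videoPath, transcript) {
  // Replace the media extension with .srt / .vtt.
  const base = videoPath.replace(/\.[^.]+$/, '');
  const gen = new SubtitleGenerator();
  await writeFile(`${base}.srt`, gen.generateSRT(transcript), 'utf8');
  await writeFile(`${base}.vtt`, gen.generateVTT(transcript), 'utf8');
}
```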
### Speaker Diarization

Identify and separate different speakers:
```javascript
class VideoTranscriptionSystem {
  // ... previous methods ...

  async transcribeWithSpeakers(videoPath) {
    const result = await this.transcribeVideo(videoPath, {
      speakers: true
    });

    // Group segments by speaker
    const bySpeaker = this.groupBySpeaker(result.transcript.segments);
    return {
      ...result,
      speakers: bySpeaker
    };
  }

  groupBySpeaker(segments) {
    const speakers = {};
    segments.forEach(segment => {
      const speaker = segment.speaker || 'Unknown';
      if (!speakers[speaker]) {
        speakers[speaker] = {
          segments: [],
          totalDuration: 0,
          wordCount: 0
        };
      }
      speakers[speaker].segments.push(segment);
      speakers[speaker].totalDuration += segment.end - segment.start;
      speakers[speaker].wordCount += segment.text.split(' ').length;
    });
    return speakers;
  }
}
```
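A quick way to inspect the grouped result, assuming your transcription service populates `segment.speaker` (the file name is illustrative):

```javascript
const { speakers } = await transcription.transcribeWithSpeakers('meeting.mp4');

// Print per-speaker talk time, segment count, and word count.
for (const [name, stats] of Object.entries(speakers)) {
  console.log(
    `${name}: ${stats.totalDuration.toFixed(1)}s across ` +
    `${stats.segments.length} segments (${stats.wordCount} words)`
  );
}
```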
### Integration with Knowledge Extraction

Extract insights from transcripts:
```javascript
import { KnowledgeExtraction } from 'gaik-toolkit';

class TranscriptAnalyzer {
  constructor() {
    this.extractor = new KnowledgeExtraction({
      queryMode: 'semantic'
    });
  }

  async analyze(transcriptionResult) {
    const knowledge = transcriptionResult.knowledge;

    // Extract key topics
    const topics = await this.extractor.extract(
      knowledge,
      'What are the main topics discussed?'
    );

    // Extract action items
    const actions = await this.extractor.extract(
      knowledge,
      'List any action items or tasks mentioned'
    );

    // Extract people mentioned
    const people = await this.extractor.extract(
      knowledge,
      'Who are the people mentioned?'
    );

    return {
      topics: topics.results,
      actions: actions.results,
      people: people.results,
      summary: knowledge.summary
    };
  }
}

// Usage
const analyzer = new TranscriptAnalyzer();
const analysis = await analyzer.analyze(transcriptionResult);
console.log('Key Topics:', analysis.topics);
console.log('Action Items:', analysis.actions);
```

### Real-time Transcription
For live streaming scenarios:
```javascript
class RealtimeTranscriber {
  constructor(apiKey) {
    this.transcriber = new TranscriptionEngine(apiKey);
    this.buffer = [];
    this.callbacks = [];
  }

  startStream(audioStream) {
    audioStream.on('data', chunk => {
      this.buffer.push(chunk);
      if (this.buffer.length >= 10) { // Process every 10 chunks
        this.processBuffer();
      }
    });
    // Flush any remaining chunks when the stream ends
    audioStream.on('end', () => this.processBuffer());
  }

  async processBuffer() {
    if (this.buffer.length === 0) return;
    const audio = Buffer.concat(this.buffer);
    this.buffer = [];
    try {
      // Assumes the underlying service accepts raw audio buffers
      // in addition to file paths.
      const result = await this.transcriber.transcribe(audio);
      // Notify listeners
      this.callbacks.forEach(cb => cb(result));
    } catch (error) {
      console.error('Real-time transcription error:', error);
    }
  }

  onTranscript(callback) {
    this.callbacks.push(callback);
  }
}

// Usage
const realtime = new RealtimeTranscriber(apiKey);
realtime.onTranscript(transcript => {
  console.log('Live:', transcript.text);
});
realtime.startStream(audioStream);
```
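For local testing without a live capture source, a plain file read stream can stand in for `audioStream`; the file name and chunk size below are arbitrary:

```javascript
import { createReadStream } from 'node:fs';

// Stream a recorded file in small chunks to simulate live audio.
const fakeLiveStream = createReadStream('recording.wav', {
  highWaterMark: 64 * 1024 // 64 KB per 'data' event
});
realtime.startStream(fakeLiveStream);
```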
### Batch Processing Example

Process large numbers of files efficiently:
```javascript
class BatchProcessor {
  constructor(transcriptionSystem) {
    this.system = transcriptionSystem;
    this.queue = [];
    this.processing = false;
    this.maxConcurrent = 3;
  }

  addToQueue(filePath, options = {}) {
    this.queue.push({ filePath, options });
    if (!this.processing) {
      this.processQueue();
    }
  }

  async processQueue() {
    this.processing = true;
    while (this.queue.length > 0) {
      // Take up to maxConcurrent items and process them in parallel
      const batch = this.queue.splice(0, this.maxConcurrent);
      await Promise.all(
        batch.map(item => this.processFile(item.filePath, item.options))
      );
    }
    this.processing = false;
  }

  async processFile(filePath, options) {
    try {
      console.log(`Processing: ${filePath}`);
      const result = await this.system.transcribeVideo(filePath, options);
      await this.saveResult(result);
      console.log(`Completed: ${filePath}`);
    } catch (error) {
      console.error(`Failed: ${filePath}`, error);
    }
  }

  async saveResult(result) {
    // Save to a database or the file system;
    // the implementation depends on your storage.
  }
}
```
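The `saveResult` stub could, for example, write each result as pretty-printed JSON into an output directory. A minimal file-system sketch (the directory and naming scheme are assumptions):

```javascript
import { mkdir, writeFile } from 'node:fs/promises';
import path from 'node:path';

async function saveResult(result, outDir = 'transcripts') {
  // Create the output directory on first use.
  await mkdir(outDir, { recursive: true });
  // Derive a stable filename from the source media file.
  const base = path.basename(result.source).replace(/\.[^.]+$/, '');
  const file = path.join(outDir, `${base}.transcript.json`);
  await writeFile(file, JSON.stringify(result, null, 2), 'utf8');
  return file;
}
```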
## Output Format

Example transcription output:
```json
{
  "source": "video.mp4",
  "transcript": {
    "text": "Complete transcription text...",
    "segments": [
      {
        "start": 0,
        "end": 5.2,
        "text": "Welcome to this video.",
        "confidence": 0.98,
        "speaker": "Speaker 1"
      }
    ],
    "language": "en",
    "duration": 300
  },
  "knowledge": {
    "entities": [...],
    "concepts": [...],
    "summary": "This video discusses..."
  },
  "metadata": {
    "duration": 300,
    "format": "video/mp4"
  },
  "processedAt": "2024-03-15T10:30:00Z"
}
```

## Next Steps
- Integrate with Semantic Video Search for searchable transcripts
- Use Knowledge Extraction to query transcripts
- Add support for multiple languages
- Implement quality improvement with custom vocabulary