File size: 3,827 Bytes
5e456c0
 
 
 
 
 
 
 
 
 
 
 
 
 
19d7f69
 
 
 
6eea7b7
 
 
 
5e456c0
 
 
 
 
 
 
 
728fcbe
 
 
5e456c0
 
 
 
6eea7b7
5e456c0
ccaa693
728fcbe
 
5e456c0
1830e27
728fcbe
5b36f0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e456c0
 
 
 
6eea7b7
5e456c0
 
 
 
728fcbe
 
 
 
 
 
 
 
 
89e8314
4a56623
89e8314
 
4a56623
 
5e456c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38f9f3b
 
 
 
5e456c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package whisper

import (
	"io"
	"time"
)

///////////////////////////////////////////////////////////////////////////////
// TYPES

// SegmentCallback is the callback function for processing segments in real
// time. It is called during the Process function
type SegmentCallback func(Segment)

// ProgressCallback is the callback function for reporting progress during
// processing. It is called during the Process function
type ProgressCallback func(int)

// EncoderBeginCallback is the callback function for checking if we want to
// continue processing. It is called during the Process function
type EncoderBeginCallback func() bool

// Model is the interface to a whisper model. Create a new model with the
// function whisper.New(string)
type Model interface {
	io.Closer

	// Return a new speech-to-text context.
	NewContext() (Context, error)

	// Return true if the model is multilingual.
	IsMultilingual() bool

	// Return all languages supported.
	Languages() []string
}

// Context is the speech recognition context.
type Context interface {
	SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
	SetTranslate(bool)        // Set translate flag
	IsMultilingual() bool     // Return true if the model is multilingual.
	Language() string         // Get language
	DetectedLanguage() string // Get detected language

	SetOffset(time.Duration)          // Set offset
	SetDuration(time.Duration)        // Set duration
	SetThreads(uint)                  // Set number of threads to use
	SetSplitOnWord(bool)              // Set split on word flag
	SetTokenThreshold(float32)        // Set timestamp token probability threshold
	SetTokenSumThreshold(float32)     // Set timestamp token sum probability threshold
	SetMaxSegmentLength(uint)         // Set max segment length in characters
	SetTokenTimestamps(bool)          // Set token timestamps flag
	SetMaxTokensPerSegment(uint)      // Set max tokens per segment (0 = no limit)
	SetAudioCtx(uint)                 // Set audio encoder context
	SetMaxContext(n int)              // Set maximum number of text context tokens to store
	SetBeamSize(n int)                // Set Beam Size
	SetEntropyThold(t float32)        // Set Entropy threshold
	SetInitialPrompt(prompt string)   // Set initial prompt
	SetTemperature(t float32)         // Set temperature
	SetTemperatureFallback(t float32) // Set temperature incrementation

	// Process mono audio data and return any errors.
	// If defined, newly generated segments are passed to the
	// callback function during processing.
	Process([]float32, EncoderBeginCallback, SegmentCallback, ProgressCallback) error

	// After process is called, return segments until the end of the stream
	// is reached, when io.EOF is returned.
	NextSegment() (Segment, error)

	IsBEG(Token) bool          // Test for "begin" token
	IsSOT(Token) bool          // Test for "start of transcription" token
	IsEOT(Token) bool          // Test for "end of transcription" token
	IsPREV(Token) bool         // Test for "start of prev" token
	IsSOLM(Token) bool         // Test for "start of lm" token
	IsNOT(Token) bool          // Test for "No timestamps" token
	IsLANG(Token, string) bool // Test for token associated with a specific language
	IsText(Token) bool         // Test for text token

	// Timings
	PrintTimings()
	ResetTimings()

	SystemInfo() string
}

// Segment is the text result of a speech recognition.
type Segment struct {
	// Segment Number
	Num int

	// Time beginning and end timestamps for the segment.
	Start, End time.Duration

	// The text of the segment.
	Text string

	// The tokens of the segment.
	Tokens []Token
}

// Token is a text or special token
type Token struct {
	Id         int
	Text       string
	P          float32
	Start, End time.Duration
}