first commit
ds.js (normal file, 56 lines)
@@ -0,0 +1,56 @@
const Sox = require('sox-stream')
const DeepSpeech = require('deepspeech')
const MemoryStream = require('memory-stream')

module.exports = emitter => {
  // Beam width used in the CTC decoder when building candidate transcriptions
  const BEAM_WIDTH = 500
  // The alpha hyperparameter of the CTC decoder. Language Model weight
  const LM_WEIGHT = 1.75
  // The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
  const WORD_COUNT_WEIGHT = 1.00
  // Valid word insertion weight. This is used to lessen the word insertion penalty
  // when the inserted word is part of the vocabulary
  const VALID_WORD_COUNT_WEIGHT = 1.00

  // These constants are tied to the shape of the graph used (changing them changes
  // the geometry of the first layer), so make sure you use the same constants that
  // were used during training

  // Number of MFCC features to use
  const N_FEATURES = 26
  // Size of the context window used for producing timesteps in the input vector
  const N_CONTEXT = 9

  const MODEL = './models/output_graph.pb'
  const ALPHABET = './models/alphabet.txt'
  const LM = './models/lm.binary'
  const TRIE = './models/trie'

  console.log('Loading model from file %s', MODEL)
  let model = new DeepSpeech.Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
  console.log('Finished loading model')
  console.log('Loading language model from file(s) %s %s', LM, TRIE)
  model.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
  console.log('Finished loading language model')

  return function (stream) {
    let audioStream = new MemoryStream()
    // Convert the incoming audio to the 16 kHz, 16-bit, mono raw PCM that DeepSpeech expects
    stream.pipe(Sox({
      output: {
        bits: 16,
        rate: 16000,
        channels: 1,
        type: 'raw'
      }
    })).pipe(audioStream)

    // Once the converted audio has been fully buffered, run inference on it
    audioStream.on('finish', () => {
      let audioBuffer = audioStream.toBuffer()
      console.log('Running inference...')
      let text = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000)
      console.log('Inference finished: %s', String(text))
      emitter.emit('text', {text})
    })
  }
}
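For reference, a minimal usage sketch of the module above. This is an assumption, not part of this commit: the caller can pass in any Node EventEmitter and any readable audio stream that sox can decode; the file path and event wiring below are hypothetical.

const { EventEmitter } = require('events')
const fs = require('fs')
const createRecognizer = require('./ds')

const emitter = new EventEmitter()
emitter.on('text', ({ text }) => console.log('Transcript:', text))

// The factory loads the DeepSpeech model once; the returned function can then
// be called with any readable audio stream (here, a hypothetical WAV file).
const recognize = createRecognizer(emitter)
recognize(fs.createReadStream('./audio/sample.wav'))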