first commit

This commit is contained in:
s.golasch
2023-08-01 14:20:25 +02:00
commit e142078e83
17 changed files with 639 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

65
.gitignore vendored Normal file

@@ -0,0 +1,65 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
# next.js build output
.next
models/
models.tar.gz
package-lock.json

51
README.md Normal file

@@ -0,0 +1,51 @@
# DeepSpeech Demo
![Screenshot](public/screenshot.png)
## About
This is a simple web interface and Node server that use [DeepSpeech](https://github.com/mozilla/DeepSpeech) to provide a local speech-to-text service.
I mostly built it for myself to play around with DeepSpeech, but it can be used as a starting point for your own experiments.
## Prerequisites
The demo relies on [SoX](http://sox.sourceforge.net/) being available
on your system and in your PATH.
A quick search for `how to install SoX on (windows|ubuntu|mac)` should do the trick
if you don't know how to install it; at least it did for me.
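A quick way to check whether SoX is already set up (the install commands below are just the usual package names and may differ on your platform):
```bash
# check that the sox binary is on your PATH
sox --version
# if it is missing, install it via your package manager, e.g.
# macOS:  brew install sox
# Ubuntu: sudo apt-get install sox
```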
## Install
First, clone the repository:
```bash
git clone git@github.com:asciidisco/deepspeech-demo.git
```
Then change into the directory and run the install:
```bash
cd deepspeech-demo
npm install
```
The postinstall step downloads the pre-trained models (roughly 1.4 GB) from the DeepSpeech releases, so this might take a while.
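If the model download fails or gets interrupted, re-running the postinstall step by hand should do the trick, since it is the same script npm triggers automatically:
```bash
node postinstall.js
```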
After that, start the server using:
```bash
npm start
```
And navigate to [http://localhost:3000](http://localhost:3000)
## Usage
The user interface is quite minimal. After you've allowed the page to use your microphone,
just hit the "Listen" button and speak a few words. Once you have finished your sentence,
hit the "Listening..." button again so that the recorded sound can be processed.
After a few seconds, you should see the transcribed text in the result box below the button.

56
ds.js Normal file

@@ -0,0 +1,56 @@
const Sox = require('sox-stream')
const DeepSpeech = require('deepspeech')
const MemoryStream = require('memory-stream')
module.exports = emitter => {
// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 500
// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_WEIGHT = 1.75
// The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
const WORD_COUNT_WEIGHT = 1.00
// Valid word insertion weight. This is used to lessen the word insertion penalty
// when the inserted word is part of the vocabulary
const VALID_WORD_COUNT_WEIGHT = 1.00
// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training
// Number of MFCC features to use
const N_FEATURES = 26
// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9
const MODEL = './models/output_graph.pb'
const ALPHABET = './models/alphabet.txt'
const LM = './models/lm.binary'
const TRIE = './models/trie'
console.log('Loading model from file %s', MODEL)
let model = new DeepSpeech.Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
console.log('Finished loading model')
console.log('Loading language model from file(s) %s %s', LM, TRIE)
model.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
console.log('Finished loading language model')
return function (stream) {
let audioStream = new MemoryStream()
stream.pipe(Sox({
output: {
bits: 16,
rate: 16000,
channels: 1,
type: 'raw'
}
})).pipe(audioStream)
audioStream.on('finish', () => {
let audioBuffer = audioStream.toBuffer()
console.log('Running inference...')
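// Note: only half of the buffer length is passed to stt(); presumably the binding treats the length as a count of 16-bit samples rather than bytes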
let text = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000)
console.log('Inference finished: %s', String(text))
emitter.emit('text', {text})
})
}
}

9
index.js Normal file

@@ -0,0 +1,9 @@
const EventEmitter = require('events')
const startServer = require('./server')
const initDeepspeech = require('./ds')
const lightHook = require('./lightcontroller')
const myEmitter = new EventEmitter()
const audioStreamCb = initDeepspeech(myEmitter)
startServer(audioStreamCb, myEmitter)
lightHook(myEmitter)

10
lightcontroller.js Normal file

@@ -0,0 +1,10 @@
const http = require('http')
module.exports = function (myEmitter) {
myEmitter.on('text', text => {
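// Very crude keyword spotting on the transcript: 'ight' catches 'light'/'lights', while 'on' and 'of' (which also matches 'off') decide the switch direction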
if (text.text.search('ight') > -1) {
if (text.text.search('on') > -1) http.get('http://localhost:3434/on')
if (text.text.search('of') > -1) http.get('http://localhost:3434/off')
}
})
}

24
package.json Normal file

@@ -0,0 +1,24 @@
{
"name": "deepspeech-demo",
"version": "1.0.0",
"description": "Demo for Mozilla Deepspeech",
"main": "index.js",
"scripts": {
"start": "node index.js",
"postinstall": "node postinstall.js"
},
"author": "Sebastian Golasch <public@asciidisco.com> (https://asciidisco.com/)",
"license": "MIT",
"dependencies": {
"deepspeech": "0.1.1",
"express": "4.16.3",
"express-static": "1.2.5",
"progress": "2.0.0",
"socket.io": "2.1.1",
"socket.io-client": "2.1.1",
"socket.io-stream": "0.9.1",
"tar": "4.4.4",
"webrtc-adapter": "6.2.1",
"wget-improved": "3.0.1"
}
}

26
postinstall.js Normal file

@@ -0,0 +1,26 @@
const fs = require('fs')
const wget = require('wget-improved')
const ProgressBar = require('progress')
const tar = require('tar')
const src = 'https://github.com/mozilla/DeepSpeech/releases/download/v0.1.1/deepspeech-0.1.1-models.tar.gz'
const output = './models.tar.gz'
let bar = null
console.log('The demo is now downloading the pre-trained models from %s (roughly 1.4 GB), so this might take a while...', src)
let download = wget.download(src, output)
download.on('error', console.error)
// wget-improved reports progress as a ratio between 0 and 1, so map it onto the bar with update()
download.on('progress', progress => bar && bar.update(progress))
download.on('start', _ => {
bar = new ProgressBar(' downloading [:bar] :percent :etas', {
width: 20,
total: (100000 / 2)
})
})
download.on('end', async _ => {
bar.update(1)
console.log('')
console.log('Extracting tar archive...')
await tar.x({file: output})
console.log('Done extracting archive')
console.log('Removing temporary tar archive...')
fs.unlinkSync(output)
})

BIN
public/cool-background.png Normal file

Binary file not shown.

Size: 133 KiB

BIN
public/favicon.ico Normal file

Binary file not shown.

Size: 766 B

22
public/index.html Normal file

@@ -0,0 +1,22 @@
<!doctype html>
<html>
<head>
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Deepspeech Demo</title>
<link rel="stylesheet" type="text/css" href="styles.css"/>
</head>
<body>
<main>
<canvas id="analyser" width="512" height="250"></canvas>
<canvas id="wavedisplay" width="512" height="250"></canvas>
<button onclick="toggleRecording(this)">Listen</button>
<div id="result"></div>
</main>
<script src="adapter.js"></script>
<script src="socket.io.js"></script>
<script src="socket.io-stream.js"></script>
<script src="recorder.js"></script>
<script src="main.js"></script>
</body>
</html>

128
public/main.js Normal file

@@ -0,0 +1,128 @@
let audioContext = new window.AudioContext()
let audioInput = null
let realAudioInput = null
let inputPoint = null
let audioRecorder = null
let socket = null
let analyserContext = null
let canvasWidth = null
let canvasHeight = null
let analyserNode = null
const drawBuffer = (width, height, context, data) => {
const step = Math.ceil(data.length / width)
const amp = height / 2
context.fillStyle = 'silver'
context.clearRect(0, 0, width, height)
for (let i = 0; i < width; i++) {
let min = 1.0
let max = -1.0
for (let j = 0; j < step; j++) {
let datum = data[(i * step) + j]
if (datum < min) min = datum
if (datum > max) max = datum
}
context.fillRect(i, (1 + min) * amp, 1, Math.max(1, (max - min) * amp))
}
}
const gotBuffers = buffers => {
let canvas = document.getElementById('wavedisplay')
drawBuffer(canvas.width, canvas.height, canvas.getContext('2d'), buffers[0])
audioRecorder.exportWAV(doneEncoding)
}
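// Stream the exported WAV blob to the server via socket.io-stream; the transcript comes back on the 'news' event (see connect() below)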
const doneEncoding = blob => {
const stream = window.ss.createStream()
document.getElementById('result').textContent = 'Analysing...'
window.ss(socket).emit('audio', stream)
window.ss.createBlobReadStream(blob).pipe(stream)
}
const toggleRecording = element => {
if (element.classList.contains('recording')) {
element.textContent = 'Listen'
audioRecorder.stop()
element.classList.remove('recording')
audioRecorder.getBuffers(gotBuffers)
return
}
if (!audioRecorder) return
element.textContent = 'Listening...'
element.classList.add('recording')
audioRecorder.clear()
audioRecorder.record()
}
const updateAnalysers = time => {
if (!analyserContext) {
const canvas = document.getElementById('analyser')
canvasWidth = canvas.width
canvasHeight = canvas.height
analyserContext = canvas.getContext('2d')
}
// analyzer draw code here
const SPACING = 3
const BAR_WIDTH = 1
const numBars = Math.round(canvasWidth / SPACING)
const freqByteData = new Uint8Array(analyserNode.frequencyBinCount)
analyserNode.getByteFrequencyData(freqByteData)
analyserContext.clearRect(0, 0, canvasWidth, canvasHeight)
analyserContext.fillStyle = '#F6D565'
analyserContext.lineCap = 'round'
const multiplier = analyserNode.frequencyBinCount / numBars
// Draw rectangle for each frequency bin.
for (let i = 0; i < numBars; ++i) {
let magnitude = 0
const offset = Math.floor(i * multiplier)
// gotta sum/average the block, or we miss narrow-bandwidth spikes
for (var j = 0; j < multiplier; j++) magnitude += freqByteData[offset + j]
magnitude = magnitude / multiplier
analyserContext.fillStyle = `hsl( ${Math.round((i * 360) / numBars)}, 100%, 50%)`
analyserContext.fillRect(i * SPACING, canvasHeight, BAR_WIDTH, -magnitude)
}
window.requestAnimationFrame(updateAnalysers)
}
const gotStream = stream => {
inputPoint = audioContext.createGain()
// Create an AudioNode from the stream.
realAudioInput = audioContext.createMediaStreamSource(stream)
audioInput = realAudioInput
audioInput.connect(inputPoint)
analyserNode = audioContext.createAnalyser()
analyserNode.fftSize = 2048
inputPoint.connect(analyserNode)
audioRecorder = new window.Recorder(inputPoint)
let zeroGain = audioContext.createGain()
zeroGain.gain.value = 0.0
inputPoint.connect(zeroGain)
zeroGain.connect(audioContext.destination)
updateAnalysers()
}
const initAudio = () => {
navigator.mediaDevices.getUserMedia({
audio: {
mandatory: {
googEchoCancellation: true,
googAutoGainControl: true,
googNoiseSuppression: true,
googHighpassFilter: true
},
optional: []
}
}).then(gotStream).catch(console.error)
}
const connect = () => {
socket = window.io.connect(window.location.origin)
window.ss(socket).on('news', (stream, data) => {
document.getElementById('result').textContent = data.text
})
}
connect()
initAudio()

48
public/recorder.js Normal file

@@ -0,0 +1,48 @@
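// Minimal recorder built on a ScriptProcessor node: each audio frame is posted to a worker, which buffers the channel data and can export it as a WAV blob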
let Recorder = function (source) {
const bufferLen = 4096
let recording = false
let currCallback = null
this.context = source.context
if (!this.context.createScriptProcessor) {
this.node = this.context.createJavaScriptNode(bufferLen, 2, 2)
} else {
this.node = this.context.createScriptProcessor(bufferLen, 2, 2)
}
const worker = new Worker('./recorderWorker.js')
worker.postMessage({
command: 'init',
config: {sampleRate: this.context.sampleRate}
})
this.record = () => { recording = true }
this.stop = () => { recording = false }
this.clear = () => worker.postMessage({command: 'clear'})
this.getBuffers = cb => {
currCallback = cb
worker.postMessage({command: 'getBuffers'})
}
this.exportWAV = cb => {
currCallback = cb
worker.postMessage({command: 'exportWAV', type: 'audio/wav'})
}
this.node.onaudioprocess = e => {
if (!recording) return
worker.postMessage({
command: 'record',
buffer: [
e.inputBuffer.getChannelData(0),
e.inputBuffer.getChannelData(1)
]
})
}
worker.onmessage = e => currCallback(e.data)
source.connect(this.node)
this.node.connect(this.context.destination)
}
window.Recorder = Recorder

111
public/recorderWorker.js Normal file

@@ -0,0 +1,111 @@
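// Worker side of the recorder: buffers the Float32 channel data posted by recorder.js and can merge, interleave and encode it into a 16-bit PCM WAV blob on demand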
let recLength = 0
let recBuffersL = []
let recBuffersR = []
let sampleRate
this.onmessage = e => {
if (e.data.command === 'init') init(e.data.config)
if (e.data.command === 'record') record(e.data.buffer)
if (e.data.command === 'exportWAV') exportWAV(e.data.type)
if (e.data.command === 'getBuffers') getBuffers()
if (e.data.command === 'clear') clear()
}
const mergeBuffers = (recBuffers, recLength) => {
let result = new Float32Array(recLength)
let offset = 0
for (let i = 0; i < recBuffers.length; i++) {
result.set(recBuffers[i], offset)
offset += recBuffers[i].length
}
return result
}
const init = config => { sampleRate = config.sampleRate }
const record = inputBuffer => {
recBuffersL.push(inputBuffer[0])
recBuffersR.push(inputBuffer[1])
recLength += inputBuffer[0].length
}
const exportWAV = type => {
const bufferL = mergeBuffers(recBuffersL, recLength)
const bufferR = mergeBuffers(recBuffersR, recLength)
const interleaved = interleave(bufferL, bufferR)
const dataview = encodeWAV(interleaved)
const audioBlob = new Blob([dataview], {type})
this.postMessage(audioBlob)
}
const getBuffers = () => {
let buffers = []
buffers.push(mergeBuffers(recBuffersL, recLength))
buffers.push(mergeBuffers(recBuffersR, recLength))
this.postMessage(buffers)
}
const clear = () => {
recLength = 0
recBuffersL = []
recBuffersR = []
}
const interleave = (inputL, inputR) => {
const length = inputL.length + inputR.length
let result = new Float32Array(length)
let index = 0
let inputIndex = 0
while (index < length) {
result[index++] = inputL[inputIndex]
result[index++] = inputR[inputIndex]
inputIndex++
}
return result
}
const floatTo16BitPCM = (output, offset, input) => {
for (let i = 0; i < input.length; i++, offset += 2) {
let s = Math.max(-1, Math.min(1, input[i]))
output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true)
}
}
const writeString = (view, offset, string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i))
}
}
const encodeWAV = samples => {
let buffer = new ArrayBuffer(44 + samples.length * 2)
let view = new DataView(buffer)
/* RIFF identifier */
writeString(view, 0, 'RIFF')
/* file length */
view.setUint32(4, 32 + samples.length * 2, true)
/* RIFF type */
writeString(view, 8, 'WAVE')
/* format chunk identifier */
writeString(view, 12, 'fmt ')
/* format chunk length */
view.setUint32(16, 16, true)
/* sample format (raw) */
view.setUint16(20, 1, true)
/* channel count */
view.setUint16(22, 2, true)
/* sample rate */
view.setUint32(24, sampleRate, true)
/* byte rate (sample rate * block align) */
view.setUint32(28, sampleRate * 4, true)
/* block align (channel count * bytes per sample) */
view.setUint16(32, 4, true)
/* bits per sample */
view.setUint16(34, 16, true)
/* data chunk identifier */
writeString(view, 36, 'data')
/* data chunk length */
view.setUint32(40, samples.length * 2, true)
floatTo16BitPCM(view, 44, samples)
return view
}

BIN
public/screenshot.png Normal file

Binary file not shown.

Size: 258 KiB

60
public/styles.css Normal file

@@ -0,0 +1,60 @@
html {overflow: hidden}
body {
font: 14pt Arial, sans-serif;
background: url('cool-background.png');
display: flex;
flex-direction: row;
height: 100vh;
width: 100%;
margin: 0}
canvas {
display: flex;
align-self: flex-start;
background: #202020;
width: 50%;
height: 25%;
margin: 2rem;
box-shadow: 0px 0px 10px blue}
main {
height: 80%;
width: 100%;
display: flex;
flex-direction: column;
justify-content: flex-start;
align-items: center}
button {
display: inline-block;
border-radius: 4px;
background-color: #f4511e;
border: none;
color: #FFFFFF;
text-align: center;
font-size: 28px;
padding: 20px;
width: 200px;
transition: all 0.5s;
cursor: pointer;
margin: 5px}
#result {
color: #fff;
background-color: #333;
width: 50%;
padding: 2rem;
margin-top: 5%;
border-radius: 20px}
@media (orientation: landscape) {
body {flex-direction: row}
#controls {
flex-direction: column;
height: 100%;
width: 10%}
main {
height: 100%;
width: 90%}
}

29
server.js Normal file

@@ -0,0 +1,29 @@
const fs = require('fs')
const path = require('path')
const http = require('http')
const express = require('express')
const serve = require('express-static')
const SocketIo = require('socket.io')
const ss = require('socket.io-stream')
const PORT = 3000
const app = express()
const server = http.Server(app)
const io = SocketIo(server)
module.exports = function (streamCb, myEmitter) {
// add socket io client libs from node_modules
app.get('/socket.io-stream.js', (req, res) => fs.createReadStream(require.resolve('socket.io-stream/socket.io-stream.js')).pipe(res))
app.get('/socket.io.js', (req, res) => fs.createReadStream(require.resolve('socket.io-client/dist/socket.io.js')).pipe(res))
app.get('/socket.io.js.map', (req, res) => fs.createReadStream(require.resolve('socket.io-client/dist/socket.io.js.map')).pipe(res))
app.get('/adapter.js', (req, res) => fs.createReadStream(require.resolve('webrtc-adapter/out/adapter.js')).pipe(res))
// static resources
app.use(serve(path.join(__dirname, 'public')))
// configure socket.io stream interface (add callbacks for audio stream & return text)
io.on('connection', socket => {
ss(socket).on('audio', streamCb)
myEmitter.on('text', text => ss(socket).emit('news', ss.createStream(), text))
})
// start the server
server.listen(PORT, () => console.log('Server is running at http://localhost:%s - You\'re good to go!', server.address().port))
}