first commit

This commit is contained in:
s.golasch
2023-08-01 14:20:25 +02:00
commit e142078e83
17 changed files with 639 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

65
.gitignore vendored Normal file

@@ -0,0 +1,65 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
# next.js build output
.next
models/
models.tar.gz
package-lock.json

51
README.md Normal file

@@ -0,0 +1,51 @@
# DeepSpeech Demo
![Screenshot](public/screenshot.png)
## About
This is a simple web interface and Node server that use [DeepSpeech](https://github.com/mozilla/DeepSpeech) to provide a local speech-to-text service.
I mostly built it for myself to play around with DeepSpeech, but it can be used as a starting point for your own experiments.
## Prerequisites
The demo relies on [SoX](http://sox.sourceforge.net/) being available
on your system and in your PATH.
A quick search for `how to install SoX on (windows|ubuntu|mac)` should do the trick
if you don't know how to install it; at least it did for me.
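A quick way to check whether SoX is already set up (the install commands below are just the usual package names and may differ on your platform):
```bash
# check that the sox binary is on your PATH
sox --version
# if it is missing, install it via your package manager, e.g.
# macOS:  brew install sox
# Ubuntu: sudo apt-get install sox
```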
## Install
First, clone the repository:
```bash
git clone git@github.com:asciidisco/deepspeech-demo.git
```
Then change into the directory and run the install:
```bash
cd deepspeech-demo
npm install
```
The postinstall step downloads the pre-trained models (roughly 1.4 GB) from the DeepSpeech releases, so this might take a while.
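If the model download fails or gets interrupted, re-running the postinstall step by hand should do the trick, since it is the same script npm triggers automatically:
```bash
node postinstall.js
```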
After that, start the server using:
```bash
npm start
```
And navigate to [http://localhost:3000](http://localhost:3000)
## Usage
The user interface is quite minimal. After you've allowed the page to use your microphone,
just hit the "Listen" button and speak a few words. Once you have finished your sentence,
hit the "Listening..." button again so that the recorded sound can be processed.
After a few seconds, you should see the transcribed text in the result box below the button.

56
ds.js Normal file

@@ -0,0 +1,56 @@
const Sox = require('sox-stream')
const DeepSpeech = require('deepspeech')
const MemoryStream = require('memory-stream')
module.exports = emitter => {
// Beam width used in the CTC decoder when building candidate transcriptions
const BEAM_WIDTH = 500
// The alpha hyperparameter of the CTC decoder. Language Model weight
const LM_WEIGHT = 1.75
// The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
const WORD_COUNT_WEIGHT = 1.00
// Valid word insertion weight. This is used to lessen the word insertion penalty
// when the inserted word is part of the vocabulary
const VALID_WORD_COUNT_WEIGHT = 1.00
// These constants are tied to the shape of the graph used (changing them changes
// the geometry of the first layer), so make sure you use the same constants that
// were used during training
// Number of MFCC features to use
const N_FEATURES = 26
// Size of the context window used for producing timesteps in the input vector
const N_CONTEXT = 9
const MODEL = './models/output_graph.pb'
const ALPHABET = './models/alphabet.txt'
const LM = './models/lm.binary'
const TRIE = './models/trie'
console.log('Loading model from file %s', MODEL)
let model = new DeepSpeech.Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
console.log('Finished loading model')
console.log('Loading language model from file(s) %s %s', LM, TRIE)
model.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
console.log('Finished loading language model')
return function (stream) {
let audioStream = new MemoryStream()
stream.pipe(Sox({
output: {
bits: 16,
rate: 16000,
channels: 1,
type: 'raw'
}
})).pipe(audioStream)
audioStream.on('finish', () => {
let audioBuffer = audioStream.toBuffer()
console.log('Running inference...')
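// Note: only half of the buffer length is passed to stt(); presumably the binding treats the length as a count of 16-bit samples rather than bytes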
let text = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000)
console.log('Inference finished: %s', String(text))
emitter.emit('text', {text})
})
}
}

9
index.js Normal file

@@ -0,0 +1,9 @@
const EventEmitter = require('events')
const startServer = require('./server')
const initDeepspeech = require('./ds')
const lightHook = require('./lightcontroller')
const myEmitter = new EventEmitter()
const audioStreamCb = initDeepspeech(myEmitter)
startServer(audioStreamCb, myEmitter)
lightHook(myEmitter)

10
lightcontroller.js Normal file

@@ -0,0 +1,10 @@
const http = require('http')
module.exports = function (myEmitter) {
myEmitter.on('text', text => {
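// Very crude keyword spotting on the transcript: 'ight' catches 'light'/'lights', while 'on' and 'of' (which also matches 'off') decide the switch direction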
if (text.text.search('ight') > -1) {
if (text.text.search('on') > -1) http.get('http://localhost:3434/on')
if (text.text.search('of') > -1) http.get('http://localhost:3434/off')
}
})
}

24
package.json Normal file

@@ -0,0 +1,24 @@
{
"name": "deepspeech-demo",
"version": "1.0.0",
"description": "Demo for Mozilla Deepspeech",
"main": "index.js",
"scripts": {
"start": "node index.js",
"postinstall": "node postinstall.js"
},
"author": "Sebastian Golasch <public@asciidisco.com> (https://asciidisco.com/)",
"license": "MIT",
"dependencies": {
"deepspeech": "0.1.1",
"express": "4.16.3",
"express-static": "1.2.5",
"progress": "2.0.0",
"socket.io": "2.1.1",
"socket.io-client": "2.1.1",
"socket.io-stream": "0.9.1",
"tar": "4.4.4",
"webrtc-adapter": "6.2.1",
"wget-improved": "3.0.1"
}
}

26
postinstall.js Normal file

@@ -0,0 +1,26 @@
const fs = require('fs')
const wget = require('wget-improved')
const ProgressBar = require('progress')
const tar = require('tar')
const src = 'https://github.com/mozilla/DeepSpeech/releases/download/v0.1.1/deepspeech-0.1.1-models.tar.gz'
const output = './models.tar.gz'
let bar = null
console.log('The demo is now downloading the pre-trained models from %s (roughly 1.4 GB), so this might take a while...', src)
let download = wget.download(src, output)
download.on('error', console.error)
// wget-improved reports progress as a ratio between 0 and 1, so map it onto the bar with update()
download.on('progress', progress => bar && bar.update(progress))
download.on('start', _ => {
bar = new ProgressBar(' downloading [:bar] :percent :etas', {
width: 20,
total: (100000 / 2)
})
})
download.on('end', async _ => {
bar.update(1)
console.log('')
console.log('Extracting tar archive...')
await tar.x({file: output})
console.log('Done extracting archive')
console.log('Removing temporary tar archive...')
fs.unlinkSync(output)
})

BIN
public/cool-background.png Normal file

Binary file not shown.

Size: 133 KiB

BIN
public/favicon.ico Normal file

Binary file not shown.

Size: 766 B

22
public/index.html Normal file

@@ -0,0 +1,22 @@
<!doctype html>
<html>
<head>
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Deepspeech Demo</title>
<link rel="stylesheet" type="text/css" href="styles.css"/>
</head>
<body>
<main>
<canvas id="analyser" width="512" height="250"></canvas>
<canvas id="wavedisplay" width="512" height="250"></canvas>
<button onclick="toggleRecording(this)">Listen</button>
<div id="result"></div>
</main>
<script src="adapter.js"></script>
<script src="socket.io.js"></script>
<script src="socket.io-stream.js"></script>
<script src="recorder.js"></script>
<script src="main.js"></script>
</body>
</html>

128
public/main.js Normal file

@@ -0,0 +1,128 @@
let audioContext = new window.AudioContext()
let audioInput = null
let realAudioInput = null
let inputPoint = null
let audioRecorder = null
let socket = null
let analyserContext = null
let canvasWidth = null
let canvasHeight = null
let analyserNode = null
const drawBuffer = (width, height, context, data) => {
const step = Math.ceil(data.length / width)
const amp = height / 2
context.fillStyle = 'silver'
context.clearRect(0, 0, width, height)
for (let i = 0; i < width; i++) {
let min = 1.0
let max = -1.0
for (let j = 0; j < step; j++) {
let datum = data[(i * step) + j]
if (datum < min) min = datum
if (datum > max) max = datum
}
context.fillRect(i, (1 + min) * amp, 1, Math.max(1, (max - min) * amp))
}
}
const gotBuffers = buffers => {
let canvas = document.getElementById('wavedisplay')
drawBuffer(canvas.width, canvas.height, canvas.getContext('2d'), buffers[0])
audioRecorder.exportWAV(doneEncoding)
}
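// Stream the exported WAV blob to the server via socket.io-stream; the transcript comes back on the 'news' event (see connect() below)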
const doneEncoding = blob => {
const stream = window.ss.createStream()
document.getElementById('result').textContent = 'Analysing...'
window.ss(socket).emit('audio', stream)
window.ss.createBlobReadStream(blob).pipe(stream)
}
const toggleRecording = element => {
if (element.classList.contains('recording')) {
element.textContent = 'Listen'
audioRecorder.stop()
element.classList.remove('recording')
audioRecorder.getBuffers(gotBuffers)
return
}
if (!audioRecorder) return
element.textContent = 'Listening...'
element.classList.add('recording')
audioRecorder.clear()
audioRecorder.record()
}
const updateAnalysers = time => {
if (!analyserContext) {
const canvas = document.getElementById('analyser')
canvasWidth = canvas.width
canvasHeight = canvas.height
analyserContext = canvas.getContext('2d')
}
// analyzer draw code here
const SPACING = 3
const BAR_WIDTH = 1
const numBars = Math.round(canvasWidth / SPACING)
const freqByteData = new Uint8Array(analyserNode.frequencyBinCount)
analyserNode.getByteFrequencyData(freqByteData)
analyserContext.clearRect(0, 0, canvasWidth, canvasHeight)
analyserContext.fillStyle = '#F6D565'
analyserContext.lineCap = 'round'
const multiplier = analyserNode.frequencyBinCount / numBars
// Draw rectangle for each frequency bin.
for (let i = 0; i < numBars; ++i) {
let magnitude = 0
const offset = Math.floor(i * multiplier)
// gotta sum/average the block, or we miss narrow-bandwidth spikes
for (var j = 0; j < multiplier; j++) magnitude += freqByteData[offset + j]
magnitude = magnitude / multiplier
analyserContext.fillStyle = `hsl( ${Math.round((i * 360) / numBars)}, 100%, 50%)`
analyserContext.fillRect(i * SPACING, canvasHeight, BAR_WIDTH, -magnitude)
}
window.requestAnimationFrame(updateAnalysers)
}
const gotStream = stream => {
inputPoint = audioContext.createGain()
// Create an AudioNode from the stream.
realAudioInput = audioContext.createMediaStreamSource(stream)
audioInput = realAudioInput
audioInput.connect(inputPoint)
analyserNode = audioContext.createAnalyser()
analyserNode.fftSize = 2048
inputPoint.connect(analyserNode)
audioRecorder = new window.Recorder(inputPoint)
let zeroGain = audioContext.createGain()
zeroGain.gain.value = 0.0
inputPoint.connect(zeroGain)
zeroGain.connect(audioContext.destination)
updateAnalysers()
}
const initAudio = () => {
navigator.mediaDevices.getUserMedia({
audio: {
mandatory: {
googEchoCancellation: true,
googAutoGainControl: true,
googNoiseSuppression: true,
googHighpassFilter: true
},
optional: []
}
}).then(gotStream).catch(console.error)
}
const connect = () => {
socket = window.io.connect(window.location.origin)
window.ss(socket).on('news', (stream, data) => {
document.getElementById('result').textContent = data.text
})
}
connect()
initAudio()

48
public/recorder.js Normal file

@@ -0,0 +1,48 @@
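// Minimal recorder built on a ScriptProcessor node: each audio frame is posted to a worker, which buffers the channel data and can export it as a WAV blob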
let Recorder = function (source) {
const bufferLen = 4096
let recording = false
let currCallback = null
this.context = source.context
if (!this.context.createScriptProcessor) {
this.node = this.context.createJavaScriptNode(bufferLen, 2, 2)
} else {
this.node = this.context.createScriptProcessor(bufferLen, 2, 2)
}
const worker = new Worker('./recorderWorker.js')
worker.postMessage({
command: 'init',
config: {sampleRate: this.context.sampleRate}
})
this.record = () => { recording = true }
this.stop = () => { recording = false }
this.clear = () => worker.postMessage({command: 'clear'})
this.getBuffers = cb => {
currCallback = cb
worker.postMessage({command: 'getBuffers'})
}
this.exportWAV = cb => {
currCallback = cb
worker.postMessage({command: 'exportWAV', type: 'audio/wav'})
}
this.node.onaudioprocess = e => {
if (!recording) return
worker.postMessage({
command: 'record',
buffer: [
e.inputBuffer.getChannelData(0),
e.inputBuffer.getChannelData(1)
]
})
}
worker.onmessage = e => currCallback(e.data)
source.connect(this.node)
this.node.connect(this.context.destination)
}
window.Recorder = Recorder

111
public/recorderWorker.js Normal file

@@ -0,0 +1,111 @@
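// Worker side of the recorder: buffers the Float32 channel data posted by recorder.js and can merge, interleave and encode it into a 16-bit PCM WAV blob on demand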
let recLength = 0
let recBuffersL = []
let recBuffersR = []
let sampleRate
this.onmessage = e => {
if (e.data.command === 'init') init(e.data.config)
if (e.data.command === 'record') record(e.data.buffer)
if (e.data.command === 'exportWAV') exportWAV(e.data.type)
if (e.data.command === 'getBuffers') getBuffers()
if (e.data.command === 'clear') clear()
}
const mergeBuffers = (recBuffers, recLength) => {
let result = new Float32Array(recLength)
let offset = 0
for (let i = 0; i < recBuffers.length; i++) {
result.set(recBuffers[i], offset)
offset += recBuffers[i].length
}
return result
}
const init = config => { sampleRate = config.sampleRate }
const record = inputBuffer => {
recBuffersL.push(inputBuffer[0])
recBuffersR.push(inputBuffer[1])
recLength += inputBuffer[0].length
}
const exportWAV = type => {
const bufferL = mergeBuffers(recBuffersL, recLength)
const bufferR = mergeBuffers(recBuffersR, recLength)
const interleaved = interleave(bufferL, bufferR)
const dataview = encodeWAV(interleaved)
const audioBlob = new Blob([dataview], {type})
this.postMessage(audioBlob)
}
const getBuffers = () => {
let buffers = []
buffers.push(mergeBuffers(recBuffersL, recLength))
buffers.push(mergeBuffers(recBuffersR, recLength))
this.postMessage(buffers)
}
const clear = () => {
recLength = 0
recBuffersL = []
recBuffersR = []
}
const interleave = (inputL, inputR) => {
const length = inputL.length + inputR.length
let result = new Float32Array(length)
let index = 0
let inputIndex = 0
while (index < length) {
result[index++] = inputL[inputIndex]
result[index++] = inputR[inputIndex]
inputIndex++
}
return result
}
const floatTo16BitPCM = (output, offset, input) => {
for (let i = 0; i < input.length; i++, offset += 2) {
let s = Math.max(-1, Math.min(1, input[i]))
output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true)
}
}
const writeString = (view, offset, string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i))
}
}
const encodeWAV = samples => {
let buffer = new ArrayBuffer(44 + samples.length * 2)
let view = new DataView(buffer)
/* RIFF identifier */
writeString(view, 0, 'RIFF')
/* file length */
view.setUint32(4, 32 + samples.length * 2, true)
/* RIFF type */
writeString(view, 8, 'WAVE')
/* format chunk identifier */
writeString(view, 12, 'fmt ')
/* format chunk length */
view.setUint32(16, 16, true)
/* sample format (raw) */
view.setUint16(20, 1, true)
/* channel count */
view.setUint16(22, 2, true)
/* sample rate */
view.setUint32(24, sampleRate, true)
/* byte rate (sample rate * block align) */
view.setUint32(28, sampleRate * 4, true)
/* block align (channel count * bytes per sample) */
view.setUint16(32, 4, true)
/* bits per sample */
view.setUint16(34, 16, true)
/* data chunk identifier */
writeString(view, 36, 'data')
/* data chunk length */
view.setUint32(40, samples.length * 2, true)
floatTo16BitPCM(view, 44, samples)
return view
}

BIN
public/screenshot.png Normal file

Binary file not shown.

Size: 258 KiB

60
public/styles.css Normal file

@@ -0,0 +1,60 @@
html {overflow: hidden}
body {
font: 14pt Arial, sans-serif;
background: url('cool-background.png');
display: flex;
flex-direction: row;
height: 100vh;
width: 100%;
margin: 0}
canvas {
display: flex;
align-self: flex-start;
background: #202020;
width: 50%;
height: 25%;
margin: 2rem;
box-shadow: 0px 0px 10px blue}
main {
height: 80%;
width: 100%;
display: flex;
flex-direction: column;
justify-content: flex-start;
align-items: center}
button {
display: inline-block;
border-radius: 4px;
background-color: #f4511e;
border: none;
color: #FFFFFF;
text-align: center;
font-size: 28px;
padding: 20px;
width: 200px;
transition: all 0.5s;
cursor: pointer;
margin: 5px}
#result {
color: #fff;
background-color: #333;
width: 50%;
padding: 2rem;
margin-top: 5%;
border-radius: 20px}
@media (orientation: landscape) {
body {flex-direction: row}
#controls {
flex-direction: column;
height: 100%;
width: 10%}
main {
height: 100%;
width: 90%}
}

29
server.js Normal file

@@ -0,0 +1,29 @@
const fs = require('fs')
const path = require('path')
const http = require('http')
const express = require('express')
const serve = require('express-static')
const SocketIo = require('socket.io')
const ss = require('socket.io-stream')
const PORT = 3000
const app = express()
const server = http.Server(app)
const io = SocketIo(server)
module.exports = function (streamCb, myEmitter) {
// add socket io client libs from node_modules
app.get('/socket.io-stream.js', (req, res) => fs.createReadStream(require.resolve('socket.io-stream/socket.io-stream.js')).pipe(res))
app.get('/socket.io.js', (req, res) => fs.createReadStream(require.resolve('socket.io-client/dist/socket.io.js')).pipe(res))
app.get('/socket.io.js.map', (req, res) => fs.createReadStream(require.resolve('socket.io-client/dist/socket.io.js.map')).pipe(res))
app.get('/adapter.js', (req, res) => fs.createReadStream(require.resolve('webrtc-adapter/out/adapter.js')).pipe(res))
// static resources
app.use(serve(path.join(__dirname, 'public')))
// configure socket.io stream interface (add callbacks for audio stream & return text)
io.on('connection', socket => {
ss(socket).on('audio', streamCb)
myEmitter.on('text', text => ss(socket).emit('news', ss.createStream(), text))
})
// start the server
server.listen(PORT, () => console.log('Server is running at http://localhost:%s - You\'re good to go!', server.address().port))
}