commit e142078e83ce4821171b35dd54de2c6ceb08543d
Author: s.golasch
Date:   Tue Aug 1 14:20:25 2023 +0200

    first commit

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..4a78d57
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e62dfcc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,65 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# TypeScript v1 declaration files
+typings/
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variables file
+.env
+
+# next.js build output
+.next
+
+models/
+models.tar.gz
+package-lock.json
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0e39b9f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,51 @@
+# DeepSpeech Demo
+
+![Screenshot](public/screenshot.png)
+
+## About
+
+This is a simple web interface and Node server that uses [DeepSpeech](https://github.com/mozilla/DeepSpeech) to create a local Speech-to-Text service.
+
+I mostly built it for myself in order to play around with DeepSpeech, but it can be used
+as a starting point for your own experiments.
+
+## Prerequisites
+
+The demo relies on [SoX](http://sox.sourceforge.net/) being available
+on your system and in your PATH.
+
+A quick search for `how to install SoX on (windows|ubuntu|mac)` should do the trick
+if you don't know how to install it; at least it did for me.
+
+## Install
+
+First, clone the repository:
+
+```bash
+git clone git@github.com:asciidisco/deepspeech-demo.git
+```
+
+Then change into the directory and run an npm install:
+
+```bash
+npm install
+```
+
+This downloads the pre-trained models (roughly 1.4 GB) from the DeepSpeech releases
+as a postinstall step, so it might take a while.
+
+After that, start the server using:
+
+```bash
+npm start
+```
+
+Then navigate to [http://localhost:3000](http://localhost:3000).
+
+## Usage
+
+The user interface is quite minimal. After you've allowed the page to use your microphone,
+just hit the "Listen" button and speak a few words into your microphone.
+Once you've finished your sentence, hit the "Listening..." button again
+so that the recorded sound can be processed.
+After a few seconds, you should see the text in the result box below the button.
diff --git a/ds.js b/ds.js
new file mode 100644
index 0000000..bf999e5
--- /dev/null
+++ b/ds.js
@@ -0,0 +1,56 @@
+const Sox = require('sox-stream')
+const DeepSpeech = require('deepspeech')
+const MemoryStream = require('memory-stream')
+
+module.exports = emitter => {
+  // Beam width used in the CTC decoder when building candidate transcriptions
+  const BEAM_WIDTH = 500
+  // The alpha hyperparameter of the CTC decoder. Language Model weight
+  const LM_WEIGHT = 1.75
+  // The beta hyperparameter of the CTC decoder. Word insertion weight (penalty)
+  const WORD_COUNT_WEIGHT = 1.00
+  // Valid word insertion weight. This is used to lessen the word insertion penalty
+  // when the inserted word is part of the vocabulary
+  const VALID_WORD_COUNT_WEIGHT = 1.00
+
+  // These constants are tied to the shape of the graph used (changing them changes
+  // the geometry of the first layer), so make sure you use the same constants that
+  // were used during training
+
+  // Number of MFCC features to use
+  const N_FEATURES = 26
+  // Size of the context window used for producing timesteps in the input vector
+  const N_CONTEXT = 9
+
+  const MODEL = './models/output_graph.pb'
+  const ALPHABET = './models/alphabet.txt'
+  const LM = './models/lm.binary'
+  const TRIE = './models/trie'
+
+  console.log('Loading model from file %s', MODEL)
+  let model = new DeepSpeech.Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
+  console.log('Finished loading model')
+  console.log('Loading language model from file(s) %s %s', LM, TRIE)
+  model.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
+  console.log('Finished loading language model')
+
+  return function (stream) {
+    let audioStream = new MemoryStream()
+    stream.pipe(Sox({
+      output: {
+        bits: 16,
+        rate: 16000,
+        channels: 1,
+        type: 'raw'
+      }
+    })).pipe(audioStream)
+
+    audioStream.on('finish', () => {
+      let audioBuffer = audioStream.toBuffer()
+      console.log('Running inference...')
+      let text = model.stt(audioBuffer.slice(0, audioBuffer.length / 2), 16000)
+      console.log('Inference finished: %s', String(text))
+      emitter.emit('text', {text})
+    })
+  }
+}
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..9d6a0d2
--- /dev/null
+++ b/index.js
@@ -0,0 +1,9 @@
+const EventEmitter = require('events')
+const startServer = require('./server')
+const initDeepspeech = require('./ds')
+const lightHook = require('./lightcontroller')
+
+const myEmitter = new EventEmitter()
+const audioStreamCb = initDeepspeech(myEmitter)
+startServer(audioStreamCb, myEmitter)
+lightHook(myEmitter)
diff --git a/lightcontroller.js b/lightcontroller.js
new file mode 100644
index 0000000..4e529fa
--- /dev/null
+++ b/lightcontroller.js
@@ -0,0 +1,10 @@
+const http = require('http')
+
+module.exports = function (myEmitter) {
+  myEmitter.on('text', text => {
+    if (text.text.search('ight') > -1) {
+      if (text.text.search('on') > -1) http.get('http://localhost:3434/on')
+      if (text.text.search('of') > -1) http.get('http://localhost:3434/off')
+    }
+  })
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..e39cb68
--- /dev/null
+++ b/package.json
@@ -0,0 +1,24 @@
+{
+  "name": "deepspeech-demo",
+  "version": "1.0.0",
+  "description": "Demo for Mozilla Deepspeech",
+  "main": "index.js",
+  "scripts": {
+    "start": "node index.js",
+    "postinstall": "node postinstall.js"
+  },
+  "author": "Sebastian Golasch (https://asciidisco.com/)",
+  "license": "MIT",
+  "dependencies": {
+    "deepspeech": "0.1.1",
+    "express": "4.16.3",
+    "express-static": "1.2.5",
+    "progress": "2.0.0",
+    "socket.io": "2.1.1",
+    "socket.io-client": "2.1.1",
+    "socket.io-stream": "0.9.1",
+    "tar": "4.4.4",
+    "webrtc-adapter": "6.2.1",
+    "wget-improved": "3.0.1"
+  }
+}
diff --git a/postinstall.js b/postinstall.js
new file mode 100644
index 0000000..5a55aa7
--- /dev/null
+++ b/postinstall.js
@@ -0,0 +1,26 @@
+const fs = require('fs')
+const wget = require('wget-improved')
+const ProgressBar = require('progress')
+const tar = require('tar')
+const src = 'https://github.com/mozilla/DeepSpeech/releases/download/v0.1.1/deepspeech-0.1.1-models.tar.gz'
+const output = './models.tar.gz'
+let bar = null
+console.log('The Demo is now downloading the pre-trained models from %s (roughly 1.4 GB), this will take a few moments...', src)
+let download = wget.download(src, output)
+download.on('error', console.error)
+download.on('progress', progress => bar.tick(progress))
+download.on('start', _ => {
+  bar = new ProgressBar(' downloading [:bar] :percent :etas', {
+    width: 20,
+    total: (100000 / 2)
+  })
+})
+download.on('end', async _ => {
+  bar.tick(100000 / 2)
+  console.log('')
+  console.log('Extracting tar archive...')
+  await tar.x({file: output})
+  console.log('Done extracting archive')
+  console.log('Removing temporary tar archive...')
+  fs.unlinkSync(output)
+})
diff --git a/public/cool-background.png b/public/cool-background.png
new file mode 100644
index 0000000..37e99a9
Binary files /dev/null and b/public/cool-background.png differ
diff --git a/public/favicon.ico b/public/favicon.ico
new file mode 100644
index 0000000..be74abd
Binary files /dev/null and b/public/favicon.ico differ
diff --git a/public/index.html b/public/index.html
new file mode 100644
index 0000000..42ce053
--- /dev/null
+++ b/public/index.html
@@ -0,0 +1,22 @@
[The 22 added lines of index.html lost their markup in this dump; only the page title "Deepspeech Demo" is recoverable.]
diff --git a/public/main.js b/public/main.js
new file mode 100644
index 0000000..b779e48
--- /dev/null
+++ b/public/main.js
@@ -0,0 +1,128 @@
+let audioContext = new window.AudioContext()
+let audioInput = null
+let realAudioInput = null
+let inputPoint = null
+let audioRecorder = null
+let socket = null
+let analyserContext = null
+let canvasWidth = null
+let canvasHeight = null
+let analyserNode = null
+
+const drawBuffer = (width, height, context, data) => {
+  const step = Math.ceil(data.length / width)
+  const amp = height / 2
+  context.fillStyle = 'silver'
+  context.clearRect(0, 0, width, height)
+  for (let i = 0; i < width; i++) {
+    let min = 1.0
+    let max = -1.0
+    for (let j = 0; j < step; j++) {
+      let datum = data[(i * step) + j]
+      if (datum < min) min = datum
+      if (datum > max) max = datum
+    }
+    context.fillRect(i, (1 + min) * amp, 1, Math.max(1, (max - min) * amp))
+  }
+}
+
+const gotBuffers = buffers => {
+  let canvas = document.getElementById('wavedisplay')
+  drawBuffer(canvas.width, canvas.height, canvas.getContext('2d'), buffers[0])
+  audioRecorder.exportWAV(doneEncoding)
+}
+
+const doneEncoding = blob => {
+  const stream = window.ss.createStream()
+  document.getElementById('result').textContent = 'Analysing...'
+  window.ss(socket).emit('audio', stream)
+  window.ss.createBlobReadStream(blob).pipe(stream)
+}
+
+const toggleRecording = element => {
+  if (element.classList.contains('recording')) {
+    element.textContent = 'Listen'
+    audioRecorder.stop()
+    element.classList.remove('recording')
+    audioRecorder.getBuffers(gotBuffers)
+    return
+  }
+  element.textContent = 'Listening...'
+  if (!audioRecorder) return
+  element.classList.add('recording')
+  audioRecorder.clear()
+  audioRecorder.record()
+}
+
+const updateAnalysers = time => {
+  if (!analyserContext) {
+    const canvas = document.getElementById('analyser')
+    canvasWidth = canvas.width
+    canvasHeight = canvas.height
+    analyserContext = canvas.getContext('2d')
+  }
+
+  // analyzer draw code here
+  const SPACING = 3
+  const BAR_WIDTH = 1
+  const numBars = Math.round(canvasWidth / SPACING)
+  const freqByteData = new Uint8Array(analyserNode.frequencyBinCount)
+  analyserNode.getByteFrequencyData(freqByteData)
+  analyserContext.clearRect(0, 0, canvasWidth, canvasHeight)
+  analyserContext.fillStyle = '#F6D565'
+  analyserContext.lineCap = 'round'
+  const multiplier = analyserNode.frequencyBinCount / numBars
+
+  // Draw rectangle for each frequency bin.
+  for (let i = 0; i < numBars; ++i) {
+    let magnitude = 0
+    const offset = Math.floor(i * multiplier)
+    // gotta sum/average the block, or we miss narrow-bandwidth spikes
+    for (var j = 0; j < multiplier; j++) magnitude += freqByteData[offset + j]
+    magnitude = magnitude / multiplier
+    analyserContext.fillStyle = `hsl( ${Math.round((i * 360) / numBars)}, 100%, 50%)`
+    analyserContext.fillRect(i * SPACING, canvasHeight, BAR_WIDTH, -magnitude)
+  }
+  window.requestAnimationFrame(updateAnalysers)
+}
+
+const gotStream = stream => {
+  inputPoint = audioContext.createGain()
+  // Create an AudioNode from the stream.
+  realAudioInput = audioContext.createMediaStreamSource(stream)
+  audioInput = realAudioInput
+  audioInput.connect(inputPoint)
+  analyserNode = audioContext.createAnalyser()
+  analyserNode.fftSize = 2048
+  inputPoint.connect(analyserNode)
+  audioRecorder = new window.Recorder(inputPoint)
+  let zeroGain = audioContext.createGain()
+  zeroGain.gain.value = 0.0
+  inputPoint.connect(zeroGain)
+  zeroGain.connect(audioContext.destination)
+  updateAnalysers()
+}
+
+const initAudio = () => {
+  navigator.mediaDevices.getUserMedia({
+    audio: {
+      mandatory: {
+        googEchoCancellation: true,
+        googAutoGainControl: true,
+        googNoiseSuppression: true,
+        googHighpassFilter: true
+      },
+      optional: []
+    }
+  }).then(gotStream).catch(console.error)
+}
+
+const connect = () => {
+  socket = window.io.connect(window.location.origin)
+  window.ss(socket).on('news', (stream, data) => {
+    document.getElementById('result').textContent = data.text
+  })
+}
+
+connect()
+initAudio()
diff --git a/public/recorder.js b/public/recorder.js
new file mode 100644
index 0000000..e9aad6e
--- /dev/null
+++ b/public/recorder.js
@@ -0,0 +1,48 @@
+let Recorder = function (source) {
+  const bufferLen = 4096
+  let recording = false
+  let currCallback = null
+
+  this.context = source.context
+  if (!this.context.createScriptProcessor) {
+    this.node = this.context.createJavaScriptNode(bufferLen, 2, 2)
+  } else {
+    this.node = this.context.createScriptProcessor(bufferLen, 2, 2)
+  }
+
+  const worker = new Worker('./recorderWorker.js')
+  worker.postMessage({
+    command: 'init',
+    config: {sampleRate: this.context.sampleRate}
+  })
+
+  this.record = () => { recording = true }
+  this.stop = () => { recording = false }
+  this.clear = () => worker.postMessage({command: 'clear'})
+  this.getBuffers = cb => {
+    currCallback = cb
+    worker.postMessage({command: 'getBuffers'})
+  }
+
+  this.exportWAV = cb => {
+    currCallback = cb
+    worker.postMessage({command: 'exportWAV', type: 'audio/wav'})
+  }
+
+  this.node.onaudioprocess = e => {
+    if (!recording) return
+    worker.postMessage({
+      command: 'record',
+      buffer: [
+        e.inputBuffer.getChannelData(0),
+        e.inputBuffer.getChannelData(1)
+      ]
+    })
+  }
+
+  worker.onmessage = e => currCallback(e.data)
+  source.connect(this.node)
+  this.node.connect(this.context.destination)
+}
+
+window.Recorder = Recorder
diff --git a/public/recorderWorker.js b/public/recorderWorker.js
new file mode 100644
index 0000000..1fc2439
--- /dev/null
+++ b/public/recorderWorker.js
@@ -0,0 +1,111 @@
+let recLength = 0
+let recBuffersL = []
+let recBuffersR = []
+let sampleRate
+
+this.onmessage = e => {
+  if (e.data.command === 'init') init(e.data.config)
+  if (e.data.command === 'record') record(e.data.buffer)
+  if (e.data.command === 'exportWAV') exportWAV(e.data.type)
+  if (e.data.command === 'getBuffers') getBuffers()
+  if (e.data.command === 'clear') clear()
+}
+
+const mergeBuffers = (recBuffers, recLength) => {
+  let result = new Float32Array(recLength)
+  let offset = 0
+  for (let i = 0; i < recBuffers.length; i++) {
+    result.set(recBuffers[i], offset)
+    offset += recBuffers[i].length
+  }
+  return result
+}
+
+const init = config => { sampleRate = config.sampleRate }
+
+const record = inputBuffer => {
+  recBuffersL.push(inputBuffer[0])
+  recBuffersR.push(inputBuffer[1])
+  recLength += inputBuffer[0].length
+}
+
+const exportWAV = type => {
+  const bufferL = mergeBuffers(recBuffersL, recLength)
+  const bufferR = mergeBuffers(recBuffersR, recLength)
+  const interleaved = interleave(bufferL, bufferR)
+  const dataview = encodeWAV(interleaved)
+  const audioBlob = new Blob([dataview], {type})
+  this.postMessage(audioBlob)
+}
+
+const getBuffers = () => {
+  let buffers = []
+  buffers.push(mergeBuffers(recBuffersL, recLength))
+  buffers.push(mergeBuffers(recBuffersR, recLength))
+  this.postMessage(buffers)
+}
+
+const clear = () => {
+  recLength = 0
+  recBuffersL = []
+  recBuffersR = []
+}
+
+const interleave = (inputL, inputR) => {
+  const length = inputL.length + inputR.length
+  let result = new Float32Array(length)
+  let index = 0
+  let inputIndex = 0
+  while (index < length) {
+    result[index++] = inputL[inputIndex]
+    result[index++] = inputR[inputIndex]
+    inputIndex++
+  }
+  return result
+}
+
+const floatTo16BitPCM = (output, offset, input) => {
+  for (let i = 0; i < input.length; i++, offset += 2) {
+    let s = Math.max(-1, Math.min(1, input[i]))
+    output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true)
+  }
+}
+
+const writeString = (view, offset, string) => {
+  for (let i = 0; i < string.length; i++) {
+    view.setUint8(offset + i, string.charCodeAt(i))
+  }
+}
+
+const encodeWAV = samples => {
+  let buffer = new ArrayBuffer(44 + samples.length * 2)
+  let view = new DataView(buffer)
+  /* RIFF identifier */
+  writeString(view, 0, 'RIFF')
+  /* file length */
+  view.setUint32(4, 32 + samples.length * 2, true)
+  /* RIFF type */
+  writeString(view, 8, 'WAVE')
+  /* format chunk identifier */
+  writeString(view, 12, 'fmt ')
+  /* format chunk length */
+  view.setUint32(16, 16, true)
+  /* sample format (raw) */
+  view.setUint16(20, 1, true)
+  /* channel count */
+  view.setUint16(22, 2, true)
+  /* sample rate */
+  view.setUint32(24, sampleRate, true)
+  /* byte rate (sample rate * block align) */
+  view.setUint32(28, sampleRate * 4, true)
+  /* block align (channel count * bytes per sample) */
+  view.setUint16(32, 4, true)
+  /* bits per sample */
+  view.setUint16(34, 16, true)
+  /* data chunk identifier */
+  writeString(view, 36, 'data')
+  /* data chunk length */
+  view.setUint32(40, samples.length * 2, true)
+  floatTo16BitPCM(view, 44, samples)
+  return view
+}
diff --git a/public/screenshot.png b/public/screenshot.png
new file mode 100644
index 0000000..544ae2e
Binary files /dev/null and b/public/screenshot.png differ
diff --git a/public/styles.css b/public/styles.css
new file mode 100644
index 0000000..5a393ec
--- /dev/null
+++ b/public/styles.css
@@ -0,0 +1,60 @@
+html {overflow: hidden}
+
+body {
+  font: 14pt Arial, sans-serif;
+  background: url('cool-background.png');
+  display: flex;
+  flex-direction: row;
+  height: 100vh;
+  width: 100%;
+  margin: 0}
+
+canvas {
+  display: flex;
+  align-self: top;
+  background: #202020;
+  width: 50%;
+  height: 25%;
+  margin: 2rem;
+  box-shadow: 0px 0px 10px blue}
+
+main {
+  height: 80%;
+  width: 100%;
+  display: flex;
+  flex-direction: column;
+  justify-content: flex-start;
+  align-items: center}
+
+button {
+  display: inline-block;
+  border-radius: 4px;
+  background-color: #f4511e;
+  border: none;
+  color: #FFFFFF;
+  text-align: center;
+  font-size: 28px;
+  padding: 20px;
+  width: 200px;
+  transition: all 0.5s;
+  cursor: pointer;
+  margin: 5px}
+
+#result {
+  color: #fff;
+  background-color: #333;
+  width: 50%;
+  padding: 2rem;
+  margin-top: 5%;
+  border-radius: 20px}
+
+@media (orientation: landscape) {
+  body {flex-direction: row}
+  #controls {
+    flex-direction: column;
+    height: 100%;
+    width: 10%}
+  main {
+    height: 100%;
+    width: 90%}
+}
\ No newline at end of file
diff --git a/server.js b/server.js
new file mode 100644
index 0000000..21fd041
--- /dev/null
+++ b/server.js
@@ -0,0 +1,29 @@
+const fs = require('fs')
+const path = require('path')
+const http = require('http')
+const express = require('express')
+const serve = require('express-static')
+const SocketIo = require('socket.io')
+const ss = require('socket.io-stream')
+
+const PORT = 3000
+const app = express()
+const server = http.Server(app)
+const io = SocketIo(server)
+
+module.exports = function (streamCb, myEmitter) {
+  // add socket io client libs from node_modules
+  app.get('/socket.io-stream.js', (req, res) => fs.createReadStream(require.resolve('socket.io-stream/socket.io-stream.js')).pipe(res))
+  app.get('/socket.io.js', (req, res) => fs.createReadStream(require.resolve('socket.io-client/dist/socket.io.js')).pipe(res))
+  app.get('/socket.io.js.map', (req, res) => fs.createReadStream(require.resolve('socket.io-client/dist/socket.io.js.map')).pipe(res))
+  app.get('/adapter.js', (req, res) => fs.createReadStream(require.resolve('webrtc-adapter/out/adapter.js')).pipe(res))
+  // static resources
+  app.use(serve(path.join(__dirname, 'public')))
+  // configure socket.io stream interface (add callbacks for audio stream & return text)
+  io.on('connection', socket => {
+    ss(socket).on('audio', streamCb)
+    myEmitter.on('text', text => ss(socket).emit('news', ss.createStream(), text))
+  })
+  // start the server
+  server.listen(PORT, () => console.log('Server is running at http://localhost:%s - You\'re good to go!', server.address().port))
+}
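
Note: `lightcontroller.js` fires GET requests at `http://localhost:3434/on` and `http://localhost:3434/off`, but no service listening on that port is part of this commit. Below is a minimal, hypothetical sketch of what such an endpoint could look like; the port and the two routes are taken from `lightcontroller.js`, everything else (file name, logging, how the light is actually switched) is an assumption you would replace with your own smart-light integration.

```js
// light-endpoint.js (hypothetical companion service, not included in this commit)
// Minimal HTTP endpoint matching the requests made by lightcontroller.js.
const http = require('http')

const PORT = 3434 // port expected by lightcontroller.js

http.createServer((req, res) => {
  if (req.url === '/on') {
    console.log('light: on')   // stand-in: call your light's real API here
    res.end('on')
  } else if (req.url === '/off') {
    console.log('light: off')  // stand-in: call your light's real API here
    res.end('off')
  } else {
    res.statusCode = 404
    res.end()
  }
}).listen(PORT, () => console.log('Light endpoint listening on http://localhost:%s', PORT))
```

Run it alongside `npm start`; saying something like "lights on" or "lights off" into the demo should then trigger the corresponding route.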