feat: add global voice input with push-to-talk transcription (0.0.8)

This commit is contained in:
Timothy Jaeryang Baek
2026-04-11 15:16:37 -06:00
parent 13dfb0f779
commit 4db0faff97
14 changed files with 864 additions and 13 deletions
+12
View File
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.0.8] - 2026-04-11
### Added
- **Voice Input.** System-wide push-to-talk voice transcription. Press the shortcut from any app to record audio, which is automatically transcribed and sent to your active chat.
- **Voice Input Settings.** Configurable global hotkey and enable/disable toggle in Settings, with a default of Shift+Cmd+Space (macOS) or Shift+Ctrl+Space (Windows/Linux).
- **Audio Feedback.** Bundled start and stop chime sounds play when recording begins and ends.
### Fixed
- **Shortcut Recorder on macOS.** Shortcut inputs now use physical key codes instead of character values, fixing Alt key combinations producing unicode characters like √ instead of V.
## [0.0.7] - 2026-04-11
### Fixed
+4 -2
View File
@@ -15,7 +15,8 @@ export default defineConfig({
input: {
index: resolve(__dirname, 'src/preload/index.ts'),
'content-preload': resolve(__dirname, 'src/preload/content-preload.ts'),
'spotlight-preload': resolve(__dirname, 'src/preload/spotlight-preload.ts')
'spotlight-preload': resolve(__dirname, 'src/preload/spotlight-preload.ts'),
'voice-input-preload': resolve(__dirname, 'src/preload/voice-input-preload.ts')
}
}
}
@@ -25,7 +26,8 @@ export default defineConfig({
rollupOptions: {
input: {
index: resolve(__dirname, 'src/renderer/index.html'),
spotlight: resolve(__dirname, 'src/renderer/spotlight.html')
spotlight: resolve(__dirname, 'src/renderer/spotlight.html'),
'voice-input': resolve(__dirname, 'src/renderer/voice-input.html')
}
}
},
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "open-webui",
"version": "0.0.7",
"version": "0.0.8",
"license": "AGPL-3.0",
"description": "Open WebUI Desktop",
"main": "./out/main/index.js",
Binary file not shown.
Binary file not shown.
+303 -3
View File
@@ -98,6 +98,7 @@ if (process.platform === 'linux') {
let mainWindow: BrowserWindow | null = null
let contentWindow: BrowserWindow | null = null
let spotlightWindow: BrowserWindow | null = null
let voiceInputWindow: BrowserWindow | null = null
let tray: Tray | null = null
let isQuiting = false
@@ -106,10 +107,12 @@ let SERVER_URL: string | null = null
let SERVER_STATUS: string | null = null
let SERVER_REACHABLE = false
let SERVER_PID: number | null = null
let AUTH_TOKEN: string | null = null
let voiceInputRecording = false
// ─── Global Shortcuts ───────────────────────────────────
const registerShortcuts = (globalAccel?: string, spotlightAccel?: string): void => {
const registerShortcuts = (globalAccel?: string, spotlightAccel?: string, voiceInputAccel?: string): void => {
globalShortcut.unregisterAll()
// Global shortcut: bring main window to foreground
@@ -139,6 +142,20 @@ const registerShortcuts = (globalAccel?: string, spotlightAccel?: string): void
log.warn('Failed to register spotlight shortcut:', spotlightAccel, error)
}
}
// Voice input shortcut: toggle microphone recording
if (voiceInputAccel && CONFIG?.voiceInputEnabled !== false) {
try {
const ok = globalShortcut.register(voiceInputAccel, () => {
toggleVoiceInput()
})
log.info(`Voice input shortcut "${voiceInputAccel}" registered: ${ok}`)
} catch (error) {
log.warn('Failed to register voice input shortcut:', voiceInputAccel, error)
}
} else {
log.info(`Voice input shortcut skipped — accel="${voiceInputAccel}", enabled=${CONFIG?.voiceInputEnabled}`)
}
}
// ─── Spotlight Window ───────────────────────────────────
@@ -257,6 +274,122 @@ function toggleSpotlight(selectedText?: string): void {
}
}
// ─── Voice Input Window ─────────────────────────────────
/**
 * Create the small frameless, transparent, always-on-top window that hosts the
 * voice input pill, and store it in `voiceInputWindow`.
 *
 * The window is horizontally centred on the display nearest the cursor, 120px
 * below that display's top edge, and is created hidden; the caller shows it.
 *
 * @returns The newly created window.
 */
function createVoiceInputWindow(): BrowserWindow {
  const { screen } = require('electron')
  const cursorPoint = screen.getCursorScreenPoint()
  const activeDisplay = screen.getDisplayNearestPoint(cursorPoint)
  const { x: sx, y: sy, width: sw } = activeDisplay.bounds

  const winW = 340
  const winH = 72

  const win = new BrowserWindow({
    x: sx + Math.round((sw - winW) / 2),
    y: sy + 120,
    width: winW,
    height: winH,
    frame: false,
    transparent: true,
    alwaysOnTop: true,
    skipTaskbar: true,
    resizable: false,
    hasShadow: false,
    show: false,
    focusable: true,
    icon: path.join(__dirname, 'assets/icon.png'),
    webPreferences: {
      preload: join(__dirname, '../preload/voice-input-preload.js'),
      sandbox: false,
      webviewTag: false,
      autoplayPolicy: 'no-user-gesture-required',
      // Dedicated in-memory session. Without a partition this window shares the
      // default session, and the permission handler installed below would
      // replace the app-wide handler and deny every non-media permission for
      // all other windows using that session.
      partition: 'voice-input'
    }
  })
  voiceInputWindow = win

  // Grant microphone permission for the voice input window only; every other
  // permission requested inside this window's session is denied.
  win.webContents.session.setPermissionRequestHandler((_webContents, permission, callback) => {
    callback(permission === 'media')
  })

  if (is.dev && process.env['ELECTRON_RENDERER_URL']) {
    win.loadURL(`${process.env['ELECTRON_RENDERER_URL']}/voice-input.html`)
  } else {
    win.loadFile(join(__dirname, '../renderer/voice-input.html'))
  }

  // Drop the module-level reference and reset the recording flag once the
  // window is gone, so the next toggle creates a fresh window.
  win.on('closed', () => {
    voiceInputWindow = null
    voiceInputRecording = false
  })

  return win
}
/**
 * Play the bundled start or stop chime through a platform command-line player.
 *
 * - macOS: `afplay`
 * - Windows: PowerShell `Media.SoundPlayer`
 * - other platforms: `paplay`, falling back to `aplay` if that fails
 *
 * @param ascending `true` plays `chime-start.wav`, `false` plays `chime-stop.wav`.
 * @returns A promise that resolves when the player process exits. It never
 *          rejects: a missing file or a failing player resolves normally.
 */
function playChime(ascending: boolean): Promise<void> {
  return new Promise((resolve) => {
    const { execFile } = require('child_process')
    const fs = require('fs')

    const file = ascending ? 'chime-start.wav' : 'chime-stop.wav'
    const soundPath = app.isPackaged
      ? join(process.resourcesPath, 'app.asar.unpacked', 'resources', 'sounds', file)
      : join(app.getAppPath(), 'resources', 'sounds', file)

    const exists = fs.existsSync(soundPath)
    log.info(`playChime: ${ascending ? 'start' : 'stop'}, path=${soundPath}, exists=${exists}`)
    if (!exists) {
      resolve()
      return
    }

    if (process.platform === 'darwin') {
      execFile('afplay', [soundPath], (err: Error | null, _stdout: string, stderr: string) => {
        if (err) log.warn('afplay error:', err.message, stderr)
        resolve()
      })
    } else if (process.platform === 'win32') {
      // The path is embedded in a single-quoted PowerShell string literal, where
      // the only character needing escape is the apostrophe (written as ''). An
      // install path such as C:\Users\O'Brien\... would otherwise end the
      // literal early and break the command.
      const quotedPath = soundPath.replace(/'/g, "''")
      execFile(
        'powershell',
        ['-NoProfile', '-Command', `(New-Object Media.SoundPlayer '${quotedPath}').PlaySync()`],
        () => resolve()
      )
    } else {
      execFile('paplay', [soundPath], (err: Error | null) => {
        if (err) execFile('aplay', [soundPath], () => resolve())
        else resolve()
      })
    }
  })
}
/**
 * Toggle voice recording in response to the global shortcut.
 *
 * When a recording is active, the renderer is told to stop; the stop chime is
 * played later by the done/close IPC handlers. Otherwise the start chime is
 * played, the voice input window is shown (created on first use) and the
 * renderer is told to start recording.
 */
async function toggleVoiceInput(): Promise<void> {
  if (voiceInputRecording) {
    // Stop recording — chime plays in done/close handler after mic is released
    voiceInputRecording = false
    if (voiceInputWindow && !voiceInputWindow.isDestroyed()) {
      voiceInputWindow.webContents.send('voiceInput:state', { recording: false })
    }
    return
  }

  // Start recording — the chime is fire-and-forget and plays while the window opens
  voiceInputRecording = true
  void playChime(true)

  if (voiceInputWindow && !voiceInputWindow.isDestroyed()) {
    voiceInputWindow.show()
    voiceInputWindow.focus()
    voiceInputWindow.webContents.send('voiceInput:state', { recording: true })
    return
  }

  const win = createVoiceInputWindow()
  win.once('ready-to-show', () => {
    // The shortcut may have been pressed again (or the app may be quitting)
    // while the window was loading; don't pop up a window nobody wants.
    if (win.isDestroyed() || !voiceInputRecording) return
    win.show()
    win.focus()

    // Short delay before the first message so the freshly shown renderer has
    // its IPC listener attached.
    setTimeout(() => {
      // Touching webContents on a destroyed window throws.
      if (win.isDestroyed()) return
      if (!voiceInputRecording) {
        // Toggled off during the delay: telling the renderer to start now would
        // leave it recording while the main process believes it is idle.
        win.hide()
        return
      }
      win.webContents.send('voiceInput:state', { recording: true })
    }, 100)
  })
}
// ─── Windows ────────────────────────────────────────────
function createMainWindow(show = true): void {
@@ -790,7 +923,8 @@ if (!gotTheLock) {
await setConfig(config)
CONFIG = await getConfig()
updateTray()
registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut)
voiceInputRecording = false
registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut, CONFIG.voiceInputShortcut)
})
// Python/uv
@@ -941,6 +1075,12 @@ if (!gotTheLock) {
}
})
// Auth token relay from webview: remember the latest token in AUTH_TOKEN,
// storing null when an empty value is relayed.
ipcMain.handle('app:setAuthToken', (_event, token: string) => {
  AUTH_TOKEN = token ? token : null
  log.info('Auth token updated from webview')
})
// Misc
ipcMain.handle('app:reset', () => resetAppHandler())
@@ -1076,6 +1216,162 @@ if (!gotTheLock) {
}
)
// ── Voice Input ─────────────────────────────────────
// Check microphone permission. On macOS the current status is queried and, if
// not yet granted, the user is prompted; the handler resolves to 'granted' or
// 'denied'. Every other platform reports 'granted' without any check.
ipcMain.handle('voiceInput:micPermission', async () => {
  if (process.platform !== 'darwin') {
    return 'granted'
  }

  if (systemPreferences.getMediaAccessStatus('microphone') === 'granted') {
    return 'granted'
  }

  const allowed = await systemPreferences.askForMediaAccess('microphone')
  return allowed ? 'granted' : 'denied'
})
// Transcribe audio via the connected server's STT endpoint.
//
// Arguments (after the IPC event):
//   audioBuffer    raw bytes of the recording
//   rendererToken  optional bearer token supplied by the caller
//
// Resolves with the parsed JSON body of POST {server}/api/v1/audio/transcriptions.
// Rejects (after logging) when the audio is empty, no default connection is
// configured, no auth token can be found, or the server answers with a non-2xx
// status.
ipcMain.handle(
  'voiceInput:transcribe',
  async (_event, audioBuffer: ArrayBuffer, rendererToken?: string) => {
    try {
      if (!audioBuffer || audioBuffer.byteLength === 0) {
        throw new Error('No audio data to transcribe')
      }

      const config = await getConfig()
      if (!config.defaultConnectionId || config.connections.length === 0) {
        throw new Error('No connection configured')
      }

      const conn = config.connections.find((c) => c.id === config.defaultConnectionId)
      if (!conn) throw new Error('Default connection not found')

      // A local connection talks to the server this app started, when known.
      let url = conn.url
      if (conn.type === 'local' && SERVER_URL) {
        url = SERVER_URL
      }
      // 0.0.0.0 is a bind address, not a reachable host.
      if (url.startsWith('http://0.0.0.0')) {
        url = url.replace('http://0.0.0.0', 'http://localhost')
      }
      // Strip trailing slashes so the endpoint below never becomes "//api/v1/…".
      url = url.replace(/\/+$/, '')

      // Use stored auth token (relayed from webview), fall back to the
      // renderer-provided one, then to scanning live webviews.
      let token = AUTH_TOKEN || rendererToken || ''
      if (!token) {
        try {
          const { webContents: wc } = require('electron')
          for (const contents of wc.getAllWebContents()) {
            try {
              // Check isDestroyed() first: other calls on a destroyed
              // webContents throw.
              if (contents.isDestroyed() || contents.getType() !== 'webview') continue
              const candidate: unknown = await contents.executeJavaScript(
                `localStorage.getItem('token') || ''`
              )
              if (typeof candidate === 'string' && candidate) {
                token = candidate
                break
              }
            } catch {
              // Skip inaccessible webContents
            }
          }
        } catch {
          log.warn('voiceInput:transcribe — could not extract token from webviews')
        }
      }

      if (!token) {
        throw new Error('Not authenticated — open a connection first')
      }

      // Build the multipart/form-data body by hand: one "file" part framed by
      // a per-request boundary.
      // NOTE(review): the part is always labelled audio/wav / *.wav, while the
      // voice input renderer records with an audio/webm MediaRecorder — confirm
      // the server detects the container from the bytes rather than the label.
      const boundary = '----VoiceInput' + Date.now()
      const filename = `recording-${Date.now()}.wav`
      const header = [
        `--${boundary}`,
        `Content-Disposition: form-data; name="file"; filename="${filename}"`,
        `Content-Type: audio/wav`,
        '',
        ''
      ].join('\r\n')
      const footer = `\r\n--${boundary}--\r\n`

      const body = Buffer.concat([
        Buffer.from(header, 'utf-8'),
        Buffer.from(audioBuffer),
        Buffer.from(footer, 'utf-8')
      ])

      const response = await fetch(`${url}/api/v1/audio/transcriptions`, {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${token}`,
          'Content-Type': `multipart/form-data; boundary=${boundary}`
        },
        body
      })

      if (!response.ok) {
        // The error body is best-effort context only.
        const text = await response.text().catch(() => '')
        throw new Error(`Transcription failed (${response.status}): ${text}`)
      }

      return await response.json()
    } catch (error: unknown) {
      log.error('voiceInput:transcribe failed:', error)
      throw error
    }
  }
)
// Voice input completed — play the stop chime, hide the pill, and deliver the
// transcribed text to the chat through the same 'query' message Spotlight uses.
// Blank text ends the session without delivering anything; a missing default
// connection just brings the main window forward.
ipcMain.handle('voiceInput:done', async (_event, text: string) => {
  voiceInputRecording = false
  playChime(false)
  if (voiceInputWindow && !voiceInputWindow.isDestroyed()) voiceInputWindow.hide()

  if (!text?.trim()) return

  const config = await getConfig()
  const hasConnections = Boolean(config.defaultConnectionId) && config.connections.length !== 0
  const conn = hasConnections
    ? config.connections.find((c) => c.id === config.defaultConnectionId)
    : undefined

  if (!conn) {
    // Nothing to deliver to: surface the main window and stop here.
    mainWindow?.show()
    mainWindow?.focus()
    return
  }

  // Local connections use the URL of the server this app started, when known.
  const baseUrl = conn.type === 'local' && SERVER_URL ? SERVER_URL : conn.url
  // 0.0.0.0 is a bind address; address the server via localhost instead.
  const url = baseUrl.startsWith('http://0.0.0.0')
    ? baseUrl.replace('http://0.0.0.0', 'http://localhost')
    : baseUrl

  sendToRenderer('query', { query: text.trim(), connectionId: conn.id, url })

  if (mainWindow && !mainWindow.isDestroyed()) {
    mainWindow.show()
    mainWindow.focus()
  }
})
// Voice input window requests close: mark recording as stopped, play the stop
// chime and hide (not destroy) the window so it can be reused.
ipcMain.handle('voiceInput:close', () => {
  voiceInputRecording = false
  playChime(false)

  const win = voiceInputWindow
  if (win === null || win.isDestroyed()) return
  win.hide()
})
// Voice input error: the renderer reports a failure, so clear the recording
// flag and log the message. The window is left as it is.
ipcMain.handle('voiceInput:error', (_event, message: string) => {
  voiceInputRecording = false
  log.warn('Voice input error:', message)
})
// Open Terminal
ipcMain.handle('open-terminal:start', async () => {
try {
@@ -1331,7 +1627,7 @@ if (!gotTheLock) {
// Global shortcut
registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut)
registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut, CONFIG.voiceInputShortcut)
// Enable screen capture
session.defaultSession.setDisplayMediaRequestHandler(
@@ -1423,6 +1719,10 @@ if (!gotTheLock) {
spotlightWindow.destroy()
}
spotlightWindow = null
if (voiceInputWindow && !voiceInputWindow.isDestroyed()) {
voiceInputWindow.destroy()
}
voiceInputWindow = null
tray?.destroy()
tray = null
})
+5 -1
View File
@@ -828,6 +828,8 @@ export interface AppConfig {
envVars: Record<string, string>
showSidebar: boolean
spotlightPosition: { x: number; y: number } | null
voiceInputShortcut: string
voiceInputEnabled: boolean
}
const DEFAULT_CONFIG: AppConfig = {
@@ -856,7 +858,9 @@ const DEFAULT_CONFIG: AppConfig = {
},
envVars: {},
showSidebar: false,
spotlightPosition: null
spotlightPosition: null,
voiceInputShortcut: 'Shift+CommandOrControl+Space',
voiceInputEnabled: true
}
export const getConfig = async (): Promise<AppConfig> => {
+4 -1
View File
@@ -181,7 +181,10 @@ const api = {
installUpdate: () => ipcRenderer.invoke('updater:install'),
// Changelog
getChangelog: () => ipcRenderer.invoke('app:changelog')
getChangelog: () => ipcRenderer.invoke('app:changelog'),
// Auth token relay from webview
setAuthToken: (token: string) => ipcRenderer.invoke('app:setAuthToken', token)
}
if (process.contextIsolated) {
+43
View File
@@ -0,0 +1,43 @@
import { ipcRenderer, contextBridge } from 'electron'
// Bridge object handed to the voice input renderer. Each member wraps one
// IPC channel of the main process.
const api = {
  // Subscribe to start/stop commands pushed on 'voiceInput:state'.
  onRecordingState(callback: (data: { recording: boolean }) => void): void {
    ipcRenderer.on('voiceInput:state', (_event, data) => {
      callback(data)
    })
  },

  // Hand the recorded audio (and an optional token) to the main process and
  // resolve with whatever 'voiceInput:transcribe' returns.
  transcribe(audioBuffer: ArrayBuffer, token?: string): Promise<any> {
    return ipcRenderer.invoke('voiceInput:transcribe', audioBuffer, token)
  },

  // Report the final transcription text; the result is not awaited.
  done(text: string): void {
    void ipcRenderer.invoke('voiceInput:done', text)
  },

  // Ask the main process to close/hide the voice input window.
  close(): void {
    void ipcRenderer.invoke('voiceInput:close')
  },

  // Forward an error message to the main process.
  error(message: string): void {
    void ipcRenderer.invoke('voiceInput:error', message)
  }
}
// Publish the bridge as `window.voiceInputAPI`. Without context isolation the
// object is attached directly; with it, contextBridge is used and any failure
// is logged to the console.
if (!process.contextIsolated) {
  // @ts-ignore
  window.voiceInputAPI = api
} else {
  try {
    contextBridge.exposeInMainWorld('voiceInputAPI', api)
  } catch (error) {
    console.error(error)
  }
}
@@ -0,0 +1,320 @@
<script lang="ts">
import { onMount, onDestroy } from 'svelte'
// Bridge to the main process, read from `window.voiceInputAPI`.
const api = window.voiceInputAPI
// UI state: which of the three views (bars / spinner / error text) is shown.
let recording = $state(false)
let transcribing = $state(false)
// Elapsed recording time in whole seconds (incremented by `timer`).
let duration = $state(0)
let errorMsg = $state('')
// Waveform: five bar heights as fractions, resting at 0.15.
let levels: number[] = $state(Array(5).fill(0.15))
// Handle of the pending requestAnimationFrame driving the bars.
let animFrame: number | null = null
// One-second interval that advances `duration`.
let timer: ReturnType<typeof setInterval> | null = null
// Timeout that clears the error text and closes the window.
let errorTimer: ReturnType<typeof setTimeout> | null = null
// Audio: capture pipeline objects, all null while idle.
let mediaRecorder: MediaRecorder | null = null
// Encoded chunks collected from the recorder's dataavailable events.
let audioChunks: Blob[] = []
let mediaStream: MediaStream | null = null
let analyser: AnalyserNode | null = null
let audioCtx: AudioContext | null = null
// Scratch buffer the analyser writes frequency data into.
let dataArray: Uint8Array | null = null
// Dragging: whether a drag is in progress, plus the mouse (mx, my) and window
// (wx, wy) screen positions captured at mouse-down.
let dragging = false
let dragStart = { mx: 0, my: 0, wx: 0, wy: 0 }
// Render a number of seconds as `m:ss` (minutes unpadded, seconds zero-padded
// to two digits).
const formatDuration = (s: number): string => {
  const minutes = Math.floor(s / 60)
  const seconds = String(s % 60).padStart(2, '0')
  return [minutes, seconds].join(':')
}
// Per-frame update of the five waveform bars; reschedules itself with
// requestAnimationFrame. With a live analyser the bars follow five evenly
// spaced frequency bins (floored at 0.15); before the analyser exists they
// show random placeholder heights.
const animateLevel = () => {
  if (!analyser || !dataArray) {
    levels = levels.map(() => 0.15 + Math.random() * 0.6)
  } else {
    analyser.getByteFrequencyData(dataArray)

    // Sample 5 frequency bands
    const bandCount = 5
    const stride = Math.floor(dataArray.length / bandCount)
    const next: number[] = []
    for (let band = 0; band < bandCount; band++) {
      const normalised = dataArray[band * stride] / 255
      next.push(Math.max(0.15, normalised))
    }
    levels = next
  }

  animFrame = requestAnimationFrame(animateLevel)
}
// Display an error message for three seconds, then clear it and ask the main
// process to close the window. A new error restarts the countdown.
const showError = (msg: string) => {
  if (errorTimer) clearTimeout(errorTimer)
  errorMsg = msg

  const dismiss = () => {
    errorMsg = ''
    api?.close()
  }
  errorTimer = setTimeout(dismiss, 3000)
}
// Begin a recording session: reset state, show placeholder bars, wait briefly
// for the start chime, then open the microphone and start a MediaRecorder that
// collects chunks into `audioChunks`. On failure the session is torn down and
// the error text is shown.
const startRecording = async () => {
  // Reset all state from any previous session
  cleanup()
  errorMsg = ''
  transcribing = false
  recording = true
  duration = 0
  audioChunks = []
  animateLevel() // show placeholder bars immediately

  // Wait for the start chime (played from main process) to finish
  // before activating mic — macOS ducks audio when mic activates
  await new Promise((r) => setTimeout(r, 500))

  // The session may have been stopped or cancelled during the delay; opening
  // the mic now would leave it live with nothing left to stop it.
  if (!recording) return

  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true })

    // Re-check after the (possibly slow, permission-prompting) mic request:
    // drop the stream if the session ended meanwhile, or if an overlapping
    // start already installed its own stream.
    if (!recording || mediaStream) {
      stream.getTracks().forEach((t) => t.stop())
      return
    }

    mediaStream = stream
    audioChunks = []

    // Set up analyser for real audio levels
    audioCtx = new AudioContext()
    analyser = audioCtx.createAnalyser()
    analyser.fftSize = 64
    dataArray = new Uint8Array(analyser.frequencyBinCount)
    const source = audioCtx.createMediaStreamSource(mediaStream)
    source.connect(analyser)

    mediaRecorder = new MediaRecorder(mediaStream, {
      mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
        ? 'audio/webm;codecs=opus'
        : 'audio/webm'
    })

    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size > 0) audioChunks.push(e.data)
    }

    // Emit a chunk every 250 ms; count elapsed whole seconds for the display.
    mediaRecorder.start(250)
    timer = setInterval(() => { duration++ }, 1000)
  } catch (err: unknown) {
    // Tear the session down first: this releases a half-initialised stream or
    // audio context and clears `recording`, which the template checks before
    // the error text — otherwise the bars would keep showing instead.
    cleanup()
    const message = err instanceof Error ? err.message : ''
    showError(message || 'Mic access failed')
  }
}
// Return the component to idle: clear the UI flags, stop the timer and the
// bar animation, stop any active recorder, release the microphone tracks and
// close the audio context.
const cleanup = () => {
  recording = false
  transcribing = false

  if (timer) {
    clearInterval(timer)
    timer = null
  }
  if (animFrame) {
    cancelAnimationFrame(animFrame)
    animFrame = null
  }
  levels = Array(5).fill(0.15)

  const recorder = mediaRecorder
  if (recorder !== null && recorder.state !== 'inactive') {
    recorder.stop()
  }

  // Stopping every track is what actually releases the microphone.
  mediaStream?.getTracks().forEach((track) => track.stop())
  mediaStream = null

  if (audioCtx) {
    audioCtx.close()
    audioCtx = null
    analyser = null
  }

  mediaRecorder = null
}
// Abort the current session without transcribing: tear everything down and
// ask the main process to close the window.
const cancelRecording = (): void => {
  cleanup()
  if (api) api.close()
}
// Finish the current recording: stop the recorder, release the microphone,
// send the audio for transcription and report the text to the main process.
// Recordings that never started, lasted under one second, or produced less
// than 4096 bytes are treated as a cancel.
const stopRecording = async () => {
  // Capture the recorder locally: the module-level reference can be nulled by
  // cleanup() before the asynchronous `stop` event below fires.
  const recorder = mediaRecorder
  if (!recorder || recorder.state === 'inactive') {
    cancelRecording()
    return
  }

  // Too short — treat as cancel. `duration` advances once per second, so this
  // rejects anything that has not reached a full second.
  if (duration < 1) {
    cancelRecording()
    return
  }

  recording = false
  if (timer) { clearInterval(timer); timer = null }
  if (animFrame) { cancelAnimationFrame(animFrame); animFrame = null }
  levels = Array(5).fill(0.15)

  // Stop the recorder and wait for its `stop` event so the final chunk has
  // been delivered before the blob is assembled.
  const audioBlob = await new Promise<Blob>((resolve) => {
    recorder.onstop = () => {
      resolve(new Blob(audioChunks, { type: recorder.mimeType }))
    }
    recorder.stop()
  })

  // Release the microphone and analyser before the network round-trip.
  if (mediaStream) {
    mediaStream.getTracks().forEach((t) => t.stop())
    mediaStream = null
  }
  if (audioCtx) {
    audioCtx.close()
    audioCtx = null
    analyser = null
  }

  if (audioBlob.size < 4096) {
    api?.close()
    return
  }

  transcribing = true
  try {
    const buffer = await audioBlob.arrayBuffer()
    const result = await api?.transcribe(buffer)

    // cleanup() clears `transcribing`, so a false value here means the user
    // cancelled (Escape) or a new session started while we were waiting —
    // discard the stale result instead of delivering it to the chat.
    if (!transcribing) return

    const text = result?.text?.trim()
    if (text) {
      api?.done(text)
    } else {
      api?.close()
    }
  } catch (err: unknown) {
    if (!transcribing) return // cancelled meanwhile; nothing to report
    // The template renders the spinner ahead of the error text, so the flag
    // must be cleared or the message would never become visible.
    transcribing = false
    const message = err instanceof Error ? err.message : ''
    showError(message || 'Transcription failed')
  }
}
// Manual window dragging: mouse-down records where the pointer and the window
// were, mouse-move repositions the window by the pointer's travel since then,
// and mouse-up ends the drag.
const onMouseDown = (e: MouseEvent) => {
  const { screenX: mx, screenY: my } = e
  dragStart = { mx, my, wx: window.screenX, wy: window.screenY }
  dragging = true
}

const onMouseMove = (e: MouseEvent) => {
  if (dragging) {
    const dx = e.screenX - dragStart.mx
    const dy = e.screenY - dragStart.my
    window.moveTo(dragStart.wx + dx, dragStart.wy + dy)
  }
}

const onMouseUp = () => {
  dragging = false
}
// On mount, follow start/stop commands from the main process; a command that
// matches the current state is ignored.
onMount(() => {
  api?.onRecordingState((data) => {
    if (data.recording) {
      if (!recording) startRecording()
    } else if (recording) {
      stopRecording()
    }
  })
})

// On destroy, release audio resources and cancel a pending error dismissal.
onDestroy(() => {
  cleanup()
  if (errorTimer) {
    clearTimeout(errorTimer)
  }
})
</script>
<svelte:window
onkeydown={(e) => { if (e.key === 'Escape') cancelRecording() }}
onmousemove={onMouseMove}
onmouseup={onMouseUp}
/>
<!-- svelte-ignore a11y_no_static_element_interactions -->
<div class="pill" onmousedown={onMouseDown}>
{#if recording}
<div class="bars">
{#each levels as level}
<div class="bar" style="height: {6 + level * 22}px"></div>
{/each}
</div>
<span class="time">{formatDuration(duration)}</span>
{:else if transcribing}
<div class="loader"></div>
{:else if errorMsg}
<span class="err">{errorMsg}</span>
{/if}
</div>
<style>
@font-face {
font-family: 'Archivo';
src: url('../lib/assets/fonts/Archivo-Variable.ttf');
font-display: swap;
}
:global(*) { margin: 0; padding: 0; box-sizing: border-box; }
:global(html), :global(body), :global(#app) {
height: 100%; width: 100%;
background: transparent;
overflow: hidden;
user-select: none;
-webkit-font-smoothing: antialiased;
}
.pill {
position: absolute;
top: 50%; left: 50%;
transform: translate(-50%, -50%);
display: inline-flex;
align-items: center;
gap: 12px;
padding: 0 20px;
height: 44px;
border-radius: 22px;
cursor: grab;
font-family: 'Archivo', -apple-system, BlinkMacSystemFont, system-ui, sans-serif;
animation: appear 0.15s ease-out;
background: rgba(30, 30, 30, 0.78);
backdrop-filter: blur(40px) saturate(1.8);
-webkit-backdrop-filter: blur(40px) saturate(1.8);
border: 0.5px solid rgba(255, 255, 255, 0.12);
box-shadow:
0 2px 12px rgba(0, 0, 0, 0.35),
inset 0 0.5px 0 rgba(255, 255, 255, 0.06);
}
.pill:active { cursor: grabbing; }
@keyframes appear {
from { opacity: 0; transform: translate(-50%, -50%) scale(0.92); }
to { opacity: 1; transform: translate(-50%, -50%) scale(1); }
}
.bars {
display: flex;
align-items: center;
gap: 3px;
height: 28px;
}
.bar {
width: 4px;
border-radius: 99px;
background: #fff;
opacity: 0.9;
transition: height 60ms ease-out;
min-height: 6px;
}
.time {
font-size: 14px;
font-weight: 600;
font-variant-numeric: tabular-nums;
color: rgba(255, 255, 255, 0.85);
letter-spacing: 0.01em;
}
.loader {
width: 16px;
height: 16px;
border: 2px solid rgba(255, 255, 255, 0.15);
border-top-color: rgba(255, 255, 255, 0.8);
border-radius: 50%;
animation: spin 0.7s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.err {
font-size: 12px;
font-weight: 500;
color: #ff6b6b;
}
</style>
@@ -120,6 +120,13 @@
if (event.channel === 'webview:send') {
const requestData = event.args?.[0]
if (!requestData) return
// Handle auth token relay from webview
if (requestData.type === 'token:update' && requestData.token) {
window.electronAPI.setAuthToken?.(requestData.token)
return
}
try {
const response = await window.electronAPI[requestData.type]?.(requestData)
if (requestData._requestId) {
@@ -99,6 +99,12 @@
let spotlightRecording = $state(false)
let spotlightShortcutInputEl = $state<HTMLButtonElement | null>(null)
// Voice input shortcut recorder
// Accelerator string currently shown for the voice input shortcut ('' = none).
let voiceInputShortcutValue = $state('')
// True while the shortcut button is capturing a key combination. This is the
// settings recorder state, not microphone recording.
let voiceInputRecording = $state(false)
// The shortcut button element, bound so it can be focused when capture starts.
let voiceInputShortcutInputEl = $state<HTMLButtonElement | null>(null)
// Local mirror of the voice input enable/disable setting.
let voiceInputEnabled = $state(true)
// Keep shortcut value in sync with config store
$effect(() => {
if ($config?.globalShortcut !== undefined) {
@@ -112,6 +118,15 @@
}
})
// Keep the local voice input fields in sync with the config store. A field
// that is undefined in the store is left untouched; a null value falls back
// to '' for the shortcut and true for the enabled flag.
$effect(() => {
  const storedShortcut = $config?.voiceInputShortcut
  const storedEnabled = $config?.voiceInputEnabled

  if (storedShortcut !== undefined) {
    voiceInputShortcutValue = storedShortcut ?? ''
  }
  if (storedEnabled !== undefined) {
    voiceInputEnabled = storedEnabled ?? true
  }
})
const keyToElectron = (e: KeyboardEvent): string | null => {
const parts: string[] = []
if (e.metaKey || e.ctrlKey) parts.push('CommandOrControl')
@@ -122,16 +137,40 @@
const ignore = ['Control', 'Meta', 'Alt', 'Shift']
if (ignore.includes(e.key)) return null
// Map special keys
const keyMap: Record<string, string> = {
' ': 'Space',
// Use e.code to get the physical key (avoids macOS Alt producing unicode like √ for V)
const codeMap: Record<string, string> = {
Space: 'Space',
ArrowUp: 'Up',
ArrowDown: 'Down',
ArrowLeft: 'Left',
ArrowRight: 'Right',
Enter: 'Return'
Enter: 'Return',
Backquote: '`',
Minus: '-',
Equal: '=',
BracketLeft: '[',
BracketRight: ']',
Backslash: '\\',
Semicolon: ';',
Quote: "'",
Comma: ',',
Period: '.',
Slash: '/'
}
const key = keyMap[e.key] ?? (e.key.length === 1 ? e.key.toUpperCase() : e.key)
let key: string
if (codeMap[e.code]) {
key = codeMap[e.code]
} else if (e.code.startsWith('Key')) {
key = e.code.slice(3) // KeyA → A
} else if (e.code.startsWith('Digit')) {
key = e.code.slice(5) // Digit1 → 1
} else if (e.code.startsWith('F') && /^F\d+$/.test(e.code)) {
key = e.code // F1, F2, etc.
} else {
key = e.key.length === 1 ? e.key.toUpperCase() : e.key
}
parts.push(key)
return parts.join('+')
}
@@ -197,6 +236,32 @@
config.set(await window.electronAPI.getConfig())
}
}
// Key handler used while the voice input shortcut button is capturing.
//   Escape            cancels capture, keeping the current shortcut
//   Backspace/Delete  clears the shortcut
//   anything else     is converted to an accelerator and saved, provided
//                     keyToElectron yields one
const handleVoiceInputShortcutKeydown = async (e: KeyboardEvent) => {
  e.preventDefault()
  e.stopPropagation()

  // Update the local value, end capture, persist the setting and refresh the
  // config store from the main process.
  const commit = async (shortcut: string): Promise<void> => {
    voiceInputShortcutValue = shortcut
    voiceInputRecording = false
    await window.electronAPI.setConfig({ voiceInputShortcut: shortcut })
    config.set(await window.electronAPI.getConfig())
  }

  switch (e.key) {
    case 'Escape':
      voiceInputRecording = false
      return
    case 'Backspace':
    case 'Delete':
      await commit('')
      return
  }

  const accel = keyToElectron(e)
  if (accel) {
    await commit(accel)
  }
}
</script>
<div class="flex flex-col divide-y divide-white/[0.04]">
@@ -412,6 +477,78 @@
</div>
</div>
<div class="py-4 flex items-center justify-between">
<div>
<div class="text-[13px] opacity-70">Voice Input</div>
<div class="text-[11px] opacity-25 mt-0.5">Enable global push-to-talk voice transcription</div>
</div>
<Switch
checked={voiceInputEnabled}
label="Toggle voice input"
onchange={async (value) => {
voiceInputEnabled = value
await window.electronAPI.setConfig({ voiceInputEnabled: value })
config.set(await window.electronAPI.getConfig())
}}
/>
</div>
{#if voiceInputEnabled}
<div class="py-4 flex items-center justify-between">
<div>
<div class="text-[13px] opacity-70">Voice Input Shortcut</div>
<div class="text-[11px] opacity-25 mt-0.5">
{#if voiceInputRecording}
Press a key combination…
{:else}
Toggle microphone recording from anywhere
{/if}
</div>
</div>
<div class="flex items-center gap-1.5">
<button
bind:this={voiceInputShortcutInputEl}
class="text-[12px] px-3 py-1.5 border-none outline-none rounded-xl transition min-w-[80px] text-center
{voiceInputRecording
? 'bg-black/[0.08] dark:bg-white/[0.10] text-[#1d1d1f] dark:text-[#fafafa] opacity-80 animate-pulse'
: 'bg-black/[0.04] dark:bg-white/[0.06] text-[#1d1d1f] dark:text-[#fafafa] opacity-60 hover:opacity-80'}"
onclick={() => {
voiceInputRecording = true
voiceInputShortcutInputEl?.focus()
}}
onkeydown={(e) => {
if (voiceInputRecording) handleVoiceInputShortcutKeydown(e)
}}
onblur={() => {
voiceInputRecording = false
}}
>
{#if voiceInputRecording}
<span class="text-[11px]">Press keys…</span>
{:else if voiceInputShortcutValue}
{displayShortcut(voiceInputShortcutValue)}
{:else}
<span class="opacity-40">Disabled</span>
{/if}
</button>
{#if voiceInputShortcutValue && !voiceInputRecording}
<button
class="opacity-20 hover:opacity-50 transition bg-transparent border-none text-[#1d1d1f] dark:text-[#fafafa] p-0.5 shrink-0"
onclick={async () => {
voiceInputShortcutValue = ''
await window.electronAPI.setConfig({ voiceInputShortcut: '' })
config.set(await window.electronAPI.getConfig())
}}
>
<svg class="w-3 h-3" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="1.5">
<path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
</svg>
</button>
{/if}
</div>
</div>
{/if}
<!-- Advanced (collapsed by default) -->
<div class="py-4">
<button
+8
View File
@@ -0,0 +1,8 @@
import { mount } from 'svelte'
import VoiceInput from './components/VoiceInput.svelte'
// Mount the VoiceInput component into the page's #app element.
const target = document.getElementById('app')!
const app = mount(VoiceInput, { target })

export default app
+15
View File
@@ -0,0 +1,15 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Open WebUI Voice Input</title>
<meta
http-equiv="Content-Security-Policy"
content="default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; connect-src 'self'"
/>
</head>
<body>
<div id="app"></div>
<script type="module" src="/src/voice-input-main.ts"></script>
</body>
</html>