feat: add global voice input with push-to-talk transcription (0.0.8)

2026-06-30 20:57:56 -04:00 · 2026-04-11 15:16:37 -06:00
parent 13dfb0f779
commit 4db0faff97
14 changed files with 864 additions and 13 deletions
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.0.8] - 2026-04-11
+
+### Added
+
+- **Voice Input.** System-wide push-to-talk voice transcription. Press the shortcut from any app to record audio, which is automatically transcribed and sent to your active chat.
+- **Voice Input Settings.** Configurable global hotkey and enable/disable toggle in Settings, with a default of Shift+Cmd+Space (macOS) or Shift+Ctrl+Space (Windows/Linux).
+- **Audio Feedback.** Bundled start and stop chime sounds play when recording begins and ends.
+
+### Fixed
+
+- **Shortcut Recorder on macOS.** Shortcut inputs now use physical key codes instead of character values, fixing Alt key combinations producing Unicode characters like √ instead of V.
+
 ## [0.0.7] - 2026-04-11

 ### Fixed
@@ -15,7 +15,8 @@ export default defineConfig({
        input: {
          index: resolve(__dirname, 'src/preload/index.ts'),
          'content-preload': resolve(__dirname, 'src/preload/content-preload.ts'),
-          'spotlight-preload': resolve(__dirname, 'src/preload/spotlight-preload.ts')
+          'spotlight-preload': resolve(__dirname, 'src/preload/spotlight-preload.ts'),
+          'voice-input-preload': resolve(__dirname, 'src/preload/voice-input-preload.ts')
        }
      }
    }
@@ -25,7 +26,8 @@ export default defineConfig({
      rollupOptions: {
        input: {
          index: resolve(__dirname, 'src/renderer/index.html'),
-          spotlight: resolve(__dirname, 'src/renderer/spotlight.html')
+          spotlight: resolve(__dirname, 'src/renderer/spotlight.html'),
+          'voice-input': resolve(__dirname, 'src/renderer/voice-input.html')
        }
      }
    },
@@ -1,6 +1,6 @@
 {
  "name": "open-webui",
-  "version": "0.0.7",
+  "version": "0.0.8",
  "license": "AGPL-3.0",
  "description": "Open WebUI Desktop",
  "main": "./out/main/index.js",
@@ -98,6 +98,7 @@ if (process.platform === 'linux') {
 let mainWindow: BrowserWindow | null = null
 let contentWindow: BrowserWindow | null = null
 let spotlightWindow: BrowserWindow | null = null
+let voiceInputWindow: BrowserWindow | null = null
 let tray: Tray | null = null
 let isQuiting = false

@@ -106,10 +107,12 @@ let SERVER_URL: string | null = null
 let SERVER_STATUS: string | null = null
 let SERVER_REACHABLE = false
 let SERVER_PID: number | null = null
+let AUTH_TOKEN: string | null = null
+let voiceInputRecording = false

 // ─── Global Shortcuts ───────────────────────────────────

-const registerShortcuts = (globalAccel?: string, spotlightAccel?: string): void => {
+const registerShortcuts = (globalAccel?: string, spotlightAccel?: string, voiceInputAccel?: string): void => {
  globalShortcut.unregisterAll()

  // Global shortcut – bring main window to foreground
@@ -139,6 +142,20 @@ const registerShortcuts = (globalAccel?: string, spotlightAccel?: string): void
      log.warn('Failed to register spotlight shortcut:', spotlightAccel, error)
    }
  }
+
+  // Voice input shortcut – toggle microphone recording
+  if (voiceInputAccel && CONFIG?.voiceInputEnabled !== false) {
+    try {
+      const ok = globalShortcut.register(voiceInputAccel, () => {
+        toggleVoiceInput()
+      })
+      log.info(`Voice input shortcut "${voiceInputAccel}" registered: ${ok}`)
+    } catch (error) {
+      log.warn('Failed to register voice input shortcut:', voiceInputAccel, error)
+    }
+  } else {
+    log.info(`Voice input shortcut skipped — accel="${voiceInputAccel}", enabled=${CONFIG?.voiceInputEnabled}`)
+  }
 }

 // ─── Spotlight Window ───────────────────────────────────
@@ -257,6 +274,122 @@ function toggleSpotlight(selectedText?: string): void {
  }
 }

+// ─── Voice Input Window ─────────────────────────────────
+
+/**
+ * Creates the hidden, frameless voice-input overlay and stores it in
+ * `voiceInputWindow`. The window is centred horizontally on the display under
+ * the cursor, 120px below that display's top edge.
+ *
+ * The window runs in its own session partition: `setPermissionRequestHandler`
+ * replaces the handler of a whole session, so installing it on the default
+ * session would change permission handling for every other window as well.
+ *
+ * @returns The newly created window (not shown yet).
+ */
+function createVoiceInputWindow(): BrowserWindow {
+  const { screen } = require('electron')
+  const cursorPoint = screen.getCursorScreenPoint()
+  const activeDisplay = screen.getDisplayNearestPoint(cursorPoint)
+  const { x: sx, y: sy, width: sw } = activeDisplay.bounds
+
+  const winW = 340
+  const winH = 72
+
+  const win = new BrowserWindow({
+    x: sx + Math.round((sw - winW) / 2),
+    y: sy + 120,
+    width: winW,
+    height: winH,
+    frame: false,
+    transparent: true,
+    alwaysOnTop: true,
+    skipTaskbar: true,
+    resizable: false,
+    hasShadow: false,
+    show: false,
+    focusable: true,
+    icon: path.join(__dirname, 'assets/icon.png'),
+    webPreferences: {
+      preload: join(__dirname, '../preload/voice-input-preload.js'),
+      // Dedicated (in-memory) session so the permission handler below stays
+      // scoped to this window.
+      partition: 'voice-input',
+      sandbox: false,
+      webviewTag: false,
+      autoplayPolicy: 'no-user-gesture-required'
+    }
+  })
+  voiceInputWindow = win
+
+  // Grant media (microphone) access to this window's own contents only.
+  // The webContents reference is captured up front because reading
+  // `win.webContents` after the window is destroyed throws.
+  const voiceContents = win.webContents
+  voiceContents.session.setPermissionRequestHandler(
+    (webContents, permission, callback) => {
+      callback(webContents === voiceContents && permission === 'media')
+    }
+  )
+
+  // loadURL/loadFile return promises; log a failed load instead of leaving an
+  // unhandled rejection.
+  const loaded =
+    is.dev && process.env['ELECTRON_RENDERER_URL']
+      ? win.loadURL(`${process.env['ELECTRON_RENDERER_URL']}/voice-input.html`)
+      : win.loadFile(join(__dirname, '../renderer/voice-input.html'))
+  loaded.catch((error) => {
+    log.error('Failed to load voice input window:', error)
+  })
+
+  win.on('closed', () => {
+    // Reset the globals only if they still describe this window, so a late
+    // 'closed' event cannot wipe out a replacement window.
+    if (voiceInputWindow === win) {
+      voiceInputWindow = null
+      voiceInputRecording = false
+    }
+  })
+
+  return win
+}
+
+/**
+ * Plays the bundled start or stop chime through a platform audio tool:
+ * `afplay` on macOS, PowerShell's `Media.SoundPlayer` on Windows, and
+ * `paplay` with an `aplay` fallback elsewhere.
+ *
+ * @param ascending `true` plays `chime-start.wav`, `false` plays `chime-stop.wav`.
+ * @returns A promise that resolves once the player exits. It never rejects:
+ *          a missing file or a failing player is logged and treated as done.
+ */
+function playChime(ascending: boolean): Promise<void> {
+  return new Promise((resolve) => {
+    const { execFile } = require('child_process')
+    const fs = require('fs')
+    const file = ascending ? 'chime-start.wav' : 'chime-stop.wav'
+    const soundPath = app.isPackaged
+      ? join(process.resourcesPath, 'app.asar.unpacked', 'resources', 'sounds', file)
+      : join(app.getAppPath(), 'resources', 'sounds', file)
+
+    const exists = fs.existsSync(soundPath)
+    log.info(`playChime: ${ascending ? 'start' : 'stop'}, path=${soundPath}, exists=${exists}`)
+
+    if (!exists) { resolve(); return }
+
+    if (process.platform === 'darwin') {
+      execFile('afplay', [soundPath], (err, stdout, stderr) => {
+        if (err) log.warn('afplay error:', err.message, stderr)
+        resolve()
+      })
+    } else if (process.platform === 'win32') {
+      // The path is embedded in a single-quoted PowerShell string, where the
+      // only special character is the quote itself (escaped by doubling).
+      // Without this, an install path containing an apostrophe breaks the
+      // command.
+      const quotedPath = soundPath.replace(/'/g, "''")
+      execFile('powershell', ['-NoProfile', '-Command',
+        `(New-Object Media.SoundPlayer '${quotedPath}').PlaySync()`
+      ], (err) => {
+        if (err) log.warn('powershell chime error:', err.message)
+        resolve()
+      })
+    } else {
+      execFile('paplay', [soundPath], (err) => {
+        if (!err) { resolve(); return }
+        // PulseAudio client unavailable or failed — fall back to ALSA.
+        execFile('aplay', [soundPath], (fallbackErr) => {
+          if (fallbackErr) log.warn('paplay/aplay chime error:', fallbackErr.message)
+          resolve()
+        })
+      })
+    }
+  })
+}
+
+/**
+ * Flips voice recording on or off in response to the global shortcut.
+ *
+ * Stopping only notifies the overlay; the stop chime is played by the
+ * done/close IPC handlers. Starting plays the start chime, then shows the
+ * overlay (creating it on first use) and tells it to begin recording.
+ */
+async function toggleVoiceInput(): Promise<void> {
+  const overlay =
+    voiceInputWindow && !voiceInputWindow.isDestroyed() ? voiceInputWindow : null
+
+  if (voiceInputRecording) {
+    voiceInputRecording = false
+    overlay?.webContents.send('voiceInput:state', { recording: false })
+    return
+  }
+
+  voiceInputRecording = true
+  // Not awaited: the chime plays while the window is being shown.
+  playChime(true)
+
+  if (overlay) {
+    overlay.show()
+    overlay.focus()
+    overlay.webContents.send('voiceInput:state', { recording: true })
+    return
+  }
+
+  const created = createVoiceInputWindow()
+  created.once('ready-to-show', () => {
+    created.show()
+    created.focus()
+    // Short delay before sending the state message to the fresh renderer.
+    setTimeout(() => {
+      created.webContents.send('voiceInput:state', { recording: true })
+    }, 100)
+  })
+}
+
 // ─── Windows ────────────────────────────────────────────

 function createMainWindow(show = true): void {
@@ -790,7 +923,8 @@ if (!gotTheLock) {
      await setConfig(config)
      CONFIG = await getConfig()
      updateTray()
-      registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut)
+      voiceInputRecording = false
+      registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut, CONFIG.voiceInputShortcut)
    })

    // Python/uv
@@ -941,6 +1075,12 @@ if (!gotTheLock) {
      }
    })

+    // Auth token relay from webview. The value arrives over IPC and is later
+    // interpolated into an Authorization header, so only a non-empty string is
+    // stored; any other payload clears the token.
+    ipcMain.handle('app:setAuthToken', (_event, token: unknown) => {
+      AUTH_TOKEN = typeof token === 'string' && token.length > 0 ? token : null
+      log.info('Auth token updated from webview')
+    })
+
    // Misc
    ipcMain.handle('app:reset', () => resetAppHandler())

@@ -1076,6 +1216,162 @@ if (!gotTheLock) {
      }
    )

+    // ── Voice Input ─────────────────────────────────────
+
+    // Microphone permission probe. Only macOS is actually queried (and, if
+    // access is not yet granted, prompted); every other platform reports
+    // 'granted' without asking.
+    ipcMain.handle('voiceInput:micPermission', async () => {
+      if (process.platform !== 'darwin') {
+        return 'granted' // Windows/Linux don't need explicit permission
+      }
+
+      if (systemPreferences.getMediaAccessStatus('microphone') === 'granted') {
+        return 'granted'
+      }
+
+      const allowed = await systemPreferences.askForMediaAccess('microphone')
+      return allowed ? 'granted' : 'denied'
+    })
+
+    // Transcribe audio via the connected server's STT endpoint.
+    //
+    // Parameters: `audioBuffer` holds the recorded bytes; `rendererToken` is an
+    // optional bearer token used only when no token was relayed from the webview.
+    // Resolves with the parsed JSON body of the server response. Rejects (after
+    // logging) when no default connection or token is available, or when the
+    // server answers with a non-OK status.
+    ipcMain.handle('voiceInput:transcribe', async (_event, audioBuffer: ArrayBuffer, rendererToken?: string) => {
+      try {
+        const config = await getConfig()
+        if (!config.defaultConnectionId || config.connections.length === 0) {
+          throw new Error('No connection configured')
+        }
+        const conn = config.connections.find((c) => c.id === config.defaultConnectionId)
+        if (!conn) throw new Error('Default connection not found')
+
+        let url = conn.url
+        if (conn.type === 'local' && SERVER_URL) {
+          url = SERVER_URL
+        }
+        if (url.startsWith('http://0.0.0.0')) {
+          url = url.replace('http://0.0.0.0', 'http://localhost')
+        }
+        // Strip trailing slashes so the endpoint path never becomes `//api/...`.
+        url = url.replace(/\/+$/, '')
+
+        // Use stored auth token (relayed from webview), fall back to renderer-provided or contentWindow
+        let token = AUTH_TOKEN || rendererToken || ''
+        if (!token) {
+          // Scan all webContents to find the Open WebUI webview and read its token
+          try {
+            const { webContents: wc } = require('electron')
+            const allContents = wc.getAllWebContents()
+            for (const contents of allContents) {
+              try {
+                if (contents.getType() === 'webview' && !contents.isDestroyed()) {
+                  const t = await contents.executeJavaScript(
+                    `localStorage.getItem('token') || ''`
+                  )
+                  if (t) { token = t; break }
+                }
+              } catch {
+                // Skip inaccessible webContents
+              }
+            }
+          } catch {
+            log.warn('voiceInput:transcribe — could not extract token from webviews')
+          }
+        }
+
+        if (!token) {
+          throw new Error('Not authenticated — open a connection first')
+        }
+
+        const buffer = Buffer.from(audioBuffer)
+
+        // Label the upload by its container magic bytes instead of always
+        // claiming WAV: the voice-input recorder requests 'audio/webm'.
+        // 0x1A45DFA3 is the EBML header that opens WebM/Matroska files and
+        // 'OggS' opens Ogg; anything else keeps the previous WAV labelling.
+        const hasMagic = buffer.length >= 4
+        const format =
+          hasMagic && buffer.readUInt32BE(0) === 0x1a45dfa3
+            ? { ext: 'webm', mime: 'audio/webm' }
+            : hasMagic && buffer.toString('latin1', 0, 4) === 'OggS'
+              ? { ext: 'ogg', mime: 'audio/ogg' }
+              : { ext: 'wav', mime: 'audio/wav' }
+
+        // Build multipart form data manually using Node.js
+        const boundary = '----VoiceInput' + Date.now()
+        const filename = `recording-${Date.now()}.${format.ext}`
+
+        const header = [
+          `--${boundary}`,
+          `Content-Disposition: form-data; name="file"; filename="${filename}"`,
+          `Content-Type: ${format.mime}`,
+          '',
+          ''
+        ].join('\r\n')
+
+        const footer = `\r\n--${boundary}--\r\n`
+        const headerBuf = Buffer.from(header, 'utf-8')
+        const footerBuf = Buffer.from(footer, 'utf-8')
+        const body = Buffer.concat([headerBuf, buffer, footerBuf])
+
+        const response = await fetch(`${url}/api/v1/audio/transcriptions`, {
+          method: 'POST',
+          headers: {
+            'Authorization': `Bearer ${token}`,
+            'Content-Type': `multipart/form-data; boundary=${boundary}`
+          },
+          body
+        })
+
+        if (!response.ok) {
+          const text = await response.text().catch(() => '')
+          throw new Error(`Transcription failed (${response.status}): ${text}`)
+        }
+
+        return await response.json()
+      } catch (error: unknown) {
+        // Log here for diagnostics, then rethrow so the renderer's invoke() rejects.
+        log.error('voiceInput:transcribe failed:', error)
+        throw error
+      }
+    })
+
+    // Voice input completed — deliver text to chat.
+    // Always resets the recording flag, plays the stop chime and hides the
+    // overlay. Non-blank text is then sent to the renderer as a 'query' for the
+    // default connection; without a usable connection the main window is
+    // simply brought forward.
+    ipcMain.handle('voiceInput:done', async (_event, text: string) => {
+      voiceInputRecording = false
+      playChime(false)
+      if (voiceInputWindow && !voiceInputWindow.isDestroyed()) {
+        voiceInputWindow.hide()
+      }
+
+      const query = text?.trim()
+      if (!query) return
+
+      // Brings the main window forward when there is nothing to deliver to.
+      const surfaceMainWindow = (): void => {
+        mainWindow?.show()
+        mainWindow?.focus()
+      }
+
+      // Deliver text through the same path as Spotlight
+      const config = await getConfig()
+      const conn =
+        config.defaultConnectionId && config.connections.length > 0
+          ? config.connections.find((c) => c.id === config.defaultConnectionId)
+          : undefined
+      if (!conn) {
+        surfaceMainWindow()
+        return
+      }
+
+      const base = conn.type === 'local' && SERVER_URL ? SERVER_URL : conn.url
+      const url = base.startsWith('http://0.0.0.0')
+        ? base.replace('http://0.0.0.0', 'http://localhost')
+        : base
+
+      sendToRenderer('query', { query, connectionId: conn.id, url })
+
+      if (mainWindow && !mainWindow.isDestroyed()) {
+        mainWindow.show()
+        mainWindow.focus()
+      }
+    })
+
+    // Voice input window requests close: reset the recording flag, play the
+    // stop chime and hide (not destroy) the overlay so it can be reused.
+    ipcMain.handle('voiceInput:close', () => {
+      voiceInputRecording = false
+      playChime(false)
+
+      const overlay = voiceInputWindow
+      if (!overlay || overlay.isDestroyed()) return
+      overlay.hide()
+    })
+
+    // Voice input error reported by the overlay: clear the recording flag and
+    // log the message. The overlay window itself is left untouched here.
+    ipcMain.handle('voiceInput:error', (_event, message: string) => {
+      voiceInputRecording = false
+      log.warn('Voice input error:', message)
+    })
+
    // Open Terminal
    ipcMain.handle('open-terminal:start', async () => {
      try {
@@ -1331,7 +1627,7 @@ if (!gotTheLock) {


    // Global shortcut
-    registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut)
+    registerShortcuts(CONFIG.globalShortcut, CONFIG.spotlightShortcut, CONFIG.voiceInputShortcut)

    // Enable screen capture
    session.defaultSession.setDisplayMediaRequestHandler(
@@ -1423,6 +1719,10 @@ if (!gotTheLock) {
      spotlightWindow.destroy()
    }
    spotlightWindow = null
+    if (voiceInputWindow && !voiceInputWindow.isDestroyed()) {
+      voiceInputWindow.destroy()
+    }
+    voiceInputWindow = null
    tray?.destroy()
    tray = null
  })
@@ -828,6 +828,8 @@ export interface AppConfig {
  envVars: Record<string, string>
  showSidebar: boolean
  spotlightPosition: { x: number; y: number } | null
+  voiceInputShortcut: string
+  voiceInputEnabled: boolean
 }

 const DEFAULT_CONFIG: AppConfig = {
@@ -856,7 +858,9 @@ const DEFAULT_CONFIG: AppConfig = {
  },
  envVars: {},
  showSidebar: false,
-  spotlightPosition: null
+  spotlightPosition: null,
+  voiceInputShortcut: 'Shift+CommandOrControl+Space',
+  voiceInputEnabled: true
 }

 export const getConfig = async (): Promise<AppConfig> => {
@@ -181,7 +181,10 @@ const api = {
  installUpdate: () => ipcRenderer.invoke('updater:install'),

  // Changelog
-  getChangelog: () => ipcRenderer.invoke('app:changelog')
+  getChangelog: () => ipcRenderer.invoke('app:changelog'),
+
+  // Auth token relay from webview
+  setAuthToken: (token: string) => ipcRenderer.invoke('app:setAuthToken', token)
 }

 if (process.contextIsolated) {
@@ -0,0 +1,43 @@
+import { ipcRenderer, contextBridge } from 'electron'
+
+/** IPC surface handed to the voice-input renderer as `voiceInputAPI`. */
+const api = {
+  /** Registers `callback` for 'voiceInput:state' messages (start/stop requests from the main process). */
+  onRecordingState(callback: (data: { recording: boolean }) => void): void {
+    ipcRenderer.on('voiceInput:state', (_event, data) => {
+      callback(data)
+    })
+  },
+
+  /** Sends recorded audio (and an optional token) to the main process; resolves with its reply. */
+  transcribe(audioBuffer: ArrayBuffer, token?: string): Promise<any> {
+    return ipcRenderer.invoke('voiceInput:transcribe', audioBuffer, token)
+  },
+
+  /** Tells the main process that transcription produced `text`. Fire-and-forget. */
+  done(text: string): void {
+    void ipcRenderer.invoke('voiceInput:done', text)
+  },
+
+  /** Asks the main process to close/hide the voice input window. Fire-and-forget. */
+  close(): void {
+    void ipcRenderer.invoke('voiceInput:close')
+  },
+
+  /** Reports an error message to the main process. Fire-and-forget. */
+  error(message: string): void {
+    void ipcRenderer.invoke('voiceInput:error', message)
+  }
+}
+
+// Publish the API as `voiceInputAPI`: assigned directly on `window` when
+// context isolation is off, otherwise exposed through the context bridge
+// (bridge failures are only logged).
+if (!process.contextIsolated) {
+  // @ts-ignore
+  window.voiceInputAPI = api
+} else {
+  try {
+    contextBridge.exposeInMainWorld('voiceInputAPI', api)
+  } catch (error) {
+    console.error(error)
+  }
+}
@@ -0,0 +1,320 @@
+<script lang="ts">
+  import { onMount, onDestroy } from 'svelte'
+
+  // Bridge object installed by the voice-input preload script.
+  const api = window.voiceInputAPI
+
+  // UI state. The template renders the bars while `recording`, the spinner
+  // while `transcribing`, and otherwise `errorMsg` when it is non-empty.
+  let recording = $state(false)
+  let transcribing = $state(false)
+  // Elapsed recording time in whole seconds (incremented by `timer`).
+  let duration = $state(0)
+  let errorMsg = $state('')
+
+  // Waveform: five bar levels (0.15 is the resting minimum) and the id of the
+  // pending animation frame.
+  let levels: number[] = $state(Array(5).fill(0.15))
+  let animFrame: number | null = null
+
+  // `timer` ticks `duration`; `errorTimer` clears the error and closes the window.
+  let timer: ReturnType<typeof setInterval> | null = null
+  let errorTimer: ReturnType<typeof setTimeout> | null = null
+
+  // Audio: capture stream, its recorder and collected chunks, plus the
+  // analyser (and its scratch buffer) that feeds the waveform.
+  let mediaRecorder: MediaRecorder | null = null
+  let audioChunks: Blob[] = []
+  let mediaStream: MediaStream | null = null
+  let analyser: AnalyserNode | null = null
+  let audioCtx: AudioContext | null = null
+  let dataArray: Uint8Array | null = null
+
+  // Dragging: mouse (mx/my) and window (wx/wy) screen coordinates captured on
+  // mousedown.
+  let dragging = false
+  let dragStart = { mx: 0, my: 0, wx: 0, wy: 0 }
+
+  /** Formats a second count as `m:ss` (minutes unpadded, seconds zero-padded to two digits). */
+  const formatDuration = (s: number): string => {
+    const minutes = Math.floor(s / 60)
+    const seconds = String(s % 60).padStart(2, '0')
+    return `${minutes}:${seconds}`
+  }
+
+  /**
+   * Animation-frame loop driving the waveform bars. With an analyser attached
+   * it samples five evenly spaced frequency bins (floored at 0.15); without
+   * one it shows random placeholder levels. Re-schedules itself every frame.
+   */
+  const animateLevel = () => {
+    const bandCount = 5
+
+    if (!analyser || !dataArray) {
+      levels = levels.map(() => 0.15 + Math.random() * 0.6)
+    } else {
+      const bins = dataArray
+      analyser.getByteFrequencyData(bins)
+      const stride = Math.floor(bins.length / bandCount)
+      levels = Array.from({ length: bandCount }, (_, band) =>
+        Math.max(0.15, bins[band * stride] / 255)
+      )
+    }
+
+    animFrame = requestAnimationFrame(animateLevel)
+  }
+
+  /**
+   * Shows `msg` in the pill and (re)arms a 3-second timer that clears it and
+   * asks the main process to close the window.
+   */
+  const showError = (msg: string) => {
+    if (errorTimer) clearTimeout(errorTimer)
+    errorMsg = msg
+
+    const dismiss = () => {
+      errorMsg = ''
+      api?.close()
+    }
+    errorTimer = setTimeout(dismiss, 3000)
+  }
+
+  // Bumped on every start so an in-flight start can tell it has been replaced.
+  let startToken = 0
+
+  /**
+   * Begins a recording session: resets state, shows placeholder bars, waits
+   * for the start chime, then opens the microphone and starts the recorder.
+   *
+   * The function awaits twice (chime delay, `getUserMedia`). If the session is
+   * cancelled or replaced during either wait it backs out and releases any
+   * stream it obtained, so the microphone is never left open behind a hidden
+   * window. On failure all state is reset before the error is shown, so the
+   * message is visible and the next toggle starts cleanly.
+   */
+  const startRecording = async () => {
+    // Reset all state from any previous session
+    cleanup()
+    // A still-pending error timer would otherwise close this new session.
+    if (errorTimer) { clearTimeout(errorTimer); errorTimer = null }
+    const token = ++startToken
+    errorMsg = ''
+    transcribing = false
+    recording = true
+    duration = 0
+    audioChunks = []
+    animateLevel() // show placeholder bars immediately
+
+    // True once this start was cancelled (cleanup cleared `recording`) or a
+    // newer start took over.
+    const superseded = (): boolean => !recording || token !== startToken
+
+    // Wait for the start chime (played from main process) to finish
+    // before activating mic — macOS ducks audio when mic activates
+    await new Promise((r) => setTimeout(r, 500))
+    if (superseded()) return
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+      if (superseded()) {
+        // Cancelled while the stream was being opened — release the mic.
+        stream.getTracks().forEach((t) => t.stop())
+        return
+      }
+      mediaStream = stream
+      audioChunks = []
+
+      // Set up analyser for real audio levels
+      audioCtx = new AudioContext()
+      analyser = audioCtx.createAnalyser()
+      analyser.fftSize = 64
+      dataArray = new Uint8Array(analyser.frequencyBinCount)
+      const source = audioCtx.createMediaStreamSource(stream)
+      source.connect(analyser)
+
+      mediaRecorder = new MediaRecorder(stream, {
+        mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
+          ? 'audio/webm;codecs=opus'
+          : 'audio/webm'
+      })
+
+      mediaRecorder.ondataavailable = (e) => {
+        if (e.data.size > 0) audioChunks.push(e.data)
+      }
+
+      mediaRecorder.start(250)
+      timer = setInterval(() => { duration++ }, 1000)
+    } catch (err: unknown) {
+      // A cancelled or replaced session must not touch the current state.
+      if (superseded()) return
+      const message = (err instanceof Error && err.message) || 'Mic access failed'
+      // Leave recording mode first: the template only renders the error when
+      // neither `recording` nor `transcribing` is set.
+      cleanup()
+      // Let the main process clear its recording flag right away.
+      api?.error(message)
+      showError(message)
+    }
+  }
+
+  /**
+   * Returns the component to idle: clears both UI flags, stops the timer and
+   * the animation loop, resets the bars, stops an active recorder, releases
+   * the microphone tracks and closes the audio context.
+   */
+  const cleanup = () => {
+    recording = false
+    transcribing = false
+
+    if (timer) {
+      clearInterval(timer)
+      timer = null
+    }
+    if (animFrame) {
+      cancelAnimationFrame(animFrame)
+      animFrame = null
+    }
+    levels = Array(5).fill(0.15)
+
+    if (mediaRecorder !== null && mediaRecorder.state !== 'inactive') {
+      mediaRecorder.stop()
+    }
+
+    if (mediaStream) {
+      for (const track of mediaStream.getTracks()) track.stop()
+      mediaStream = null
+    }
+
+    if (audioCtx) {
+      audioCtx.close()
+      audioCtx = null
+      analyser = null
+    }
+
+    mediaRecorder = null
+  }
+
+  /** Discards the current session and asks the main process to close the window. */
+  const cancelRecording = () => {
+    cleanup()
+    api?.close()
+  }
+
+  /**
+   * Ends the current recording and submits it for transcription.
+   *
+   * Cancels instead when no recorder is active, when no full second has been
+   * counted yet, or when the captured blob is smaller than 4096 bytes.
+   * Otherwise the blob goes to the main process: non-empty text is reported
+   * via `done`, empty text closes the window, and a failure is shown as an
+   * error.
+   */
+  const stopRecording = async () => {
+    // Hold the recorder locally: `cleanup()` may null `mediaRecorder` (e.g. on
+    // Escape) while the stop event is still pending.
+    const recorder = mediaRecorder
+    if (!recorder || recorder.state === 'inactive') {
+      cancelRecording()
+      return
+    }
+
+    // Too short — treat as cancel. `duration` counts whole seconds, so this
+    // rejects anything stopped before the first one-second tick.
+    if (duration < 1) {
+      cancelRecording()
+      return
+    }
+
+    recording = false
+    if (timer) { clearInterval(timer); timer = null }
+    if (animFrame) { cancelAnimationFrame(animFrame); animFrame = null }
+    levels = Array(5).fill(0.15)
+
+    const audioBlob = await new Promise<Blob>((resolve) => {
+      recorder.onstop = () => {
+        resolve(new Blob(audioChunks, { type: recorder.mimeType }))
+      }
+      recorder.stop()
+    })
+
+    if (mediaStream) {
+      mediaStream.getTracks().forEach((t) => t.stop())
+      mediaStream = null
+    }
+    if (audioCtx) {
+      audioCtx.close()
+      audioCtx = null
+      analyser = null
+    }
+
+    if (audioBlob.size < 4096) {
+      api?.close()
+      return
+    }
+
+    transcribing = true
+    try {
+      const buffer = await audioBlob.arrayBuffer()
+      const result = await api?.transcribe(buffer)
+      const text = result?.text?.trim()
+      if (text) {
+        api?.done(text)
+      } else {
+        api?.close()
+      }
+    } catch (err: unknown) {
+      showError((err instanceof Error && err.message) || 'Transcription failed')
+    } finally {
+      // Always leave the spinner state; the template only renders the error
+      // message when `transcribing` is false.
+      transcribing = false
+    }
+  }
+
+  /** Starts a drag, remembering where the pointer and the window currently are on screen. */
+  const onMouseDown = (e: MouseEvent) => {
+    const { screenX: mx, screenY: my } = e
+    dragStart = { mx, my, wx: window.screenX, wy: window.screenY }
+    dragging = true
+  }
+
+  /** While dragging, moves the window by the pointer's offset from the drag start. */
+  const onMouseMove = (e: MouseEvent) => {
+    if (!dragging) return
+    const dx = e.screenX - dragStart.mx
+    const dy = e.screenY - dragStart.my
+    window.moveTo(dragStart.wx + dx, dragStart.wy + dy)
+  }
+
+  /** Ends the drag. */
+  const onMouseUp = () => {
+    dragging = false
+  }
+
+  // On mount, follow start/stop requests from the main process; a request
+  // matching the current state is ignored.
+  onMount(() => {
+    api?.onRecordingState((data) => {
+      if (data.recording) {
+        if (!recording) startRecording()
+      } else if (recording) {
+        stopRecording()
+      }
+    })
+  })
+
+  // On destroy, release audio resources and cancel a pending error dismissal.
+  onDestroy(() => {
+    cleanup()
+    if (errorTimer) {
+      clearTimeout(errorTimer)
+    }
+  })
+</script>
+
+<svelte:window
+  onkeydown={(e) => { if (e.key === 'Escape') cancelRecording() }}
+  onmousemove={onMouseMove}
+  onmouseup={onMouseUp}
+/>
+
+<!-- svelte-ignore a11y_no_static_element_interactions -->
+<div class="pill" onmousedown={onMouseDown}>
+  {#if recording}
+    <div class="bars">
+      {#each levels as level}
+        <div class="bar" style="height: {6 + level * 22}px"></div>
+      {/each}
+    </div>
+    <span class="time">{formatDuration(duration)}</span>
+  {:else if transcribing}
+    <div class="loader"></div>
+  {:else if errorMsg}
+    <span class="err">{errorMsg}</span>
+  {/if}
+</div>
+
+<style>
+  @font-face {
+    font-family: 'Archivo';
+    src: url('../lib/assets/fonts/Archivo-Variable.ttf');
+    font-display: swap;
+  }
+  :global(*) { margin: 0; padding: 0; box-sizing: border-box; }
+  :global(html), :global(body), :global(#app) {
+    height: 100%; width: 100%;
+    background: transparent;
+    overflow: hidden;
+    user-select: none;
+    -webkit-font-smoothing: antialiased;
+  }
+
+  .pill {
+    position: absolute;
+    top: 50%; left: 50%;
+    transform: translate(-50%, -50%);
+    display: inline-flex;
+    align-items: center;
+    gap: 12px;
+    padding: 0 20px;
+    height: 44px;
+    border-radius: 22px;
+    cursor: grab;
+    font-family: 'Archivo', -apple-system, BlinkMacSystemFont, system-ui, sans-serif;
+    animation: appear 0.15s ease-out;
+
+    background: rgba(30, 30, 30, 0.78);
+    backdrop-filter: blur(40px) saturate(1.8);
+    -webkit-backdrop-filter: blur(40px) saturate(1.8);
+    border: 0.5px solid rgba(255, 255, 255, 0.12);
+    box-shadow:
+      0 2px 12px rgba(0, 0, 0, 0.35),
+      inset 0 0.5px 0 rgba(255, 255, 255, 0.06);
+  }
+
+  .pill:active { cursor: grabbing; }
+
+  @keyframes appear {
+    from { opacity: 0; transform: translate(-50%, -50%) scale(0.92); }
+    to   { opacity: 1; transform: translate(-50%, -50%) scale(1); }
+  }
+
+  .bars {
+    display: flex;
+    align-items: center;
+    gap: 3px;
+    height: 28px;
+  }
+
+  .bar {
+    width: 4px;
+    border-radius: 99px;
+    background: #fff;
+    opacity: 0.9;
+    transition: height 60ms ease-out;
+    min-height: 6px;
+  }
+
+  .time {
+    font-size: 14px;
+    font-weight: 600;
+    font-variant-numeric: tabular-nums;
+    color: rgba(255, 255, 255, 0.85);
+    letter-spacing: 0.01em;
+  }
+
+  .loader {
+    width: 16px;
+    height: 16px;
+    border: 2px solid rgba(255, 255, 255, 0.15);
+    border-top-color: rgba(255, 255, 255, 0.8);
+    border-radius: 50%;
+    animation: spin 0.7s linear infinite;
+  }
+
+  @keyframes spin {
+    to { transform: rotate(360deg); }
+  }
+
+  .err {
+    font-size: 12px;
+    font-weight: 500;
+    color: #ff6b6b;
+  }
+</style>
@@ -120,6 +120,13 @@
          if (event.channel === 'webview:send') {
            const requestData = event.args?.[0]
            if (!requestData) return
+
+            // Handle auth token relay from webview
+            if (requestData.type === 'token:update' && requestData.token) {
+              window.electronAPI.setAuthToken?.(requestData.token)
+              return
+            }
+
            try {
              const response = await window.electronAPI[requestData.type]?.(requestData)
              if (requestData._requestId) {
@@ -99,6 +99,12 @@
  let spotlightRecording = $state(false)
  let spotlightShortcutInputEl = $state<HTMLButtonElement | null>(null)

+  // Voice input shortcut recorder
+  // Accelerator string currently shown ('' renders as "Disabled").
+  let voiceInputShortcutValue = $state('')
+  // True while the shortcut button is capturing a key combination (this is not
+  // the microphone recording state).
+  let voiceInputRecording = $state(false)
+  // The shortcut button element, focused when capture starts.
+  let voiceInputShortcutInputEl = $state<HTMLButtonElement | null>(null)
+  // Local copy of the enable toggle; also gates whether the shortcut row is rendered.
+  let voiceInputEnabled = $state(true)
+
  // Keep shortcut value in sync with config store
  $effect(() => {
    if ($config?.globalShortcut !== undefined) {
@@ -112,6 +118,15 @@
    }
  })

+  // Mirror the voice-input settings from the config store into local state;
+  // a field that is absent from the config leaves the local value unchanged.
+  $effect(() => {
+    const shortcut = $config?.voiceInputShortcut
+    const enabled = $config?.voiceInputEnabled
+
+    if (shortcut !== undefined) voiceInputShortcutValue = shortcut ?? ''
+    if (enabled !== undefined) voiceInputEnabled = enabled ?? true
+  })
+
  const keyToElectron = (e: KeyboardEvent): string | null => {
    const parts: string[] = []
    if (e.metaKey || e.ctrlKey) parts.push('CommandOrControl')
@@ -122,16 +137,40 @@
    const ignore = ['Control', 'Meta', 'Alt', 'Shift']
    if (ignore.includes(e.key)) return null

-    // Map special keys
-    const keyMap: Record<string, string> = {
-      ' ': 'Space',
+    // Use e.code to get the physical key (avoids macOS Alt producing unicode like √ for V)
+    const codeMap: Record<string, string> = {
+      Space: 'Space',
      ArrowUp: 'Up',
      ArrowDown: 'Down',
      ArrowLeft: 'Left',
      ArrowRight: 'Right',
-      Enter: 'Return'
+      Enter: 'Return',
+      Backquote: '`',
+      Minus: '-',
+      Equal: '=',
+      BracketLeft: '[',
+      BracketRight: ']',
+      Backslash: '\\',
+      Semicolon: ';',
+      Quote: "'",
+      Comma: ',',
+      Period: '.',
+      Slash: '/'
    }
-    const key = keyMap[e.key] ?? (e.key.length === 1 ? e.key.toUpperCase() : e.key)
+
+    let key: string
+    if (codeMap[e.code]) {
+      key = codeMap[e.code]
+    } else if (e.code.startsWith('Key')) {
+      key = e.code.slice(3) // KeyA → A
+    } else if (e.code.startsWith('Digit')) {
+      key = e.code.slice(5) // Digit1 → 1
+    } else if (e.code.startsWith('F') && /^F\d+$/.test(e.code)) {
+      key = e.code // F1, F2, etc.
+    } else {
+      key = e.key.length === 1 ? e.key.toUpperCase() : e.key
+    }
+
    parts.push(key)
    return parts.join('+')
  }
@@ -197,6 +236,32 @@
      config.set(await window.electronAPI.getConfig())
    }
  }
+
+  /**
+   * Keydown handler for the voice-input shortcut recorder. Escape aborts the
+   * capture, Backspace/Delete clears the shortcut, and any other complete
+   * combination (as translated by `keyToElectron`) is saved as the new one.
+   */
+  const handleVoiceInputShortcutKeydown = async (e: KeyboardEvent) => {
+    e.preventDefault()
+    e.stopPropagation()
+
+    // Shows the value, ends capture, persists it, then refreshes the store.
+    const commit = async (shortcut: string) => {
+      voiceInputShortcutValue = shortcut
+      voiceInputRecording = false
+      await window.electronAPI.setConfig({ voiceInputShortcut: shortcut })
+      config.set(await window.electronAPI.getConfig())
+    }
+
+    switch (e.key) {
+      case 'Escape':
+        voiceInputRecording = false
+        return
+      case 'Backspace':
+      case 'Delete':
+        await commit('')
+        return
+    }
+
+    const accel = keyToElectron(e)
+    if (accel) await commit(accel)
+  }
 </script>

 <div class="flex flex-col divide-y divide-white/[0.04]">
@@ -412,6 +477,78 @@
    </div>
  </div>

+  <div class="py-4 flex items-center justify-between">
+    <div>
+      <div class="text-[13px] opacity-70">Voice Input</div>
+      <div class="text-[11px] opacity-25 mt-0.5">Enable global push-to-talk voice transcription</div>
+    </div>
+    <Switch
+      checked={voiceInputEnabled}
+      label="Toggle voice input"
+      onchange={async (value) => {
+        voiceInputEnabled = value
+        await window.electronAPI.setConfig({ voiceInputEnabled: value })
+        config.set(await window.electronAPI.getConfig())
+      }}
+    />
+  </div>
+
+  {#if voiceInputEnabled}
+  <div class="py-4 flex items-center justify-between">
+    <div>
+      <div class="text-[13px] opacity-70">Voice Input Shortcut</div>
+      <div class="text-[11px] opacity-25 mt-0.5">
+        {#if voiceInputRecording}
+          Press a key combination…
+        {:else}
+          Toggle microphone recording from anywhere
+        {/if}
+      </div>
+    </div>
+    <div class="flex items-center gap-1.5">
+      <button
+        bind:this={voiceInputShortcutInputEl}
+        class="text-[12px] px-3 py-1.5 border-none outline-none rounded-xl transition min-w-[80px] text-center
+          {voiceInputRecording
+            ? 'bg-black/[0.08] dark:bg-white/[0.10] text-[#1d1d1f] dark:text-[#fafafa] opacity-80 animate-pulse'
+            : 'bg-black/[0.04] dark:bg-white/[0.06] text-[#1d1d1f] dark:text-[#fafafa] opacity-60 hover:opacity-80'}"
+        onclick={() => {
+          voiceInputRecording = true
+          voiceInputShortcutInputEl?.focus()
+        }}
+        onkeydown={(e) => {
+          if (voiceInputRecording) handleVoiceInputShortcutKeydown(e)
+        }}
+        onblur={() => {
+          voiceInputRecording = false
+        }}
+      >
+        {#if voiceInputRecording}
+          <span class="text-[11px]">Press keys…</span>
+        {:else if voiceInputShortcutValue}
+          {displayShortcut(voiceInputShortcutValue)}
+        {:else}
+          <span class="opacity-40">Disabled</span>
+        {/if}
+      </button>
+      {#if voiceInputShortcutValue && !voiceInputRecording}
+        <button
+          class="opacity-20 hover:opacity-50 transition bg-transparent border-none text-[#1d1d1f] dark:text-[#fafafa] p-0.5 shrink-0"
+          onclick={async () => {
+            voiceInputShortcutValue = ''
+            await window.electronAPI.setConfig({ voiceInputShortcut: '' })
+            config.set(await window.electronAPI.getConfig())
+          }}
+        >
+          <svg class="w-3 h-3" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="1.5">
+            <path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
+          </svg>
+        </button>
+      {/if}
+    </div>
+  </div>
+  {/if}
+
  <!-- Advanced (collapsed by default) -->
  <div class="py-4">
    <button
@@ -0,0 +1,8 @@
+import { mount } from 'svelte'
+import VoiceInput from './components/VoiceInput.svelte'
+
+// Mount the voice-input component into the page's #app element.
+const target = document.getElementById('app')!
+const app = mount(VoiceInput, { target })
+
+export default app
@@ -0,0 +1,15 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <title>Open WebUI – Voice Input</title>
+    <meta
+      http-equiv="Content-Security-Policy"
+      content="default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; connect-src 'self'"
+    />
+  </head>
+  <body>
+    <div id="app"></div>
+    <script type="module" src="/src/voice-input-main.ts"></script>
+  </body>
+</html>