voice-state-machine

Installation
SKILL.md

When to Use

  • Implementing an always-listening voice assistant
  • Managing voice recognition state (listening, processing, speaking)
  • Coordinating wake word detection with command processing
  • Handling voice errors and recovery

State Machine Overview

┌─────────────────────────────────────────────────────────────────┐
│                      VOICE STATE MACHINE                        │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  ┌──────────┐   START    ┌────────────┐   VOICE    ┌────────┐ │
│  │   IDLE   │───────────▶│ LISTENING  │──DETECTED─▶│PROCESS │ │
│  └──────────┘            └────────────┘            └────────┘ │
│       ▲                        │                       │       │
│       │                        │                       │       │
│       │ STOP              TIMEOUT/                COMMAND      │
│       │                    ERROR                   READY       │
│       │                        │                       │       │
│       │                        ▼                       ▼       │
│       │                  ┌──────────┐           ┌──────────┐  │
│       └──────────────────│  ERROR   │           │ SPEAKING │  │
│       │                  └──────────┘           └──────────┘  │
│       │                        │                       │       │
│       │                    RETRY                    DONE       │
│       │                        │                       │       │
│       │                        ▼                       │       │
│       │                  ┌──────────┐                  │       │
│       └──────────────────│RECOVERING│◀─────────────────┘       │
│                          └──────────┘                          │
│                                │                               │
│                           RECOVERED                            │
│                                │                               │
│                                ▼                               │
│                          (LISTENING)                           │
└─────────────────────────────────────────────────────────────────┘

Machine Implementation

// machines/voiceMachine.ts
import { setup, assign, fromCallback, fromPromise, sendTo } from 'xstate';

// Types
/** Extended state (XState context) carried by the voice machine. */
interface VoiceContext {
  /** Transcript copied from the last VOICE_DETECTED event by `setTranscript`. */
  transcript: string;
  /** Result of `parseWakeWord(transcript)`; `null` when no wake word was found. */
  command: VoiceCommand | null;
  /** Text handed to the speech synthesizer in `speaking`; `null` until one is set. */
  lastDescription: string | null;
  /** Last error message, or `null` when there is none. */
  error: string | null;
  /** Incremented by `incrementRetry`; `canRetry` allows retries while it is below 3. */
  retryCount: number;
}

/**
 * Events understood by the voice machine.
 *
 * NOTE(review): no state in the machine shown in this file handles
 * WAKE_WORD_DETECTED or COMMAND_PROCESSED — confirm whether they are still needed.
 */
type VoiceEvent =
  // Sent by the UI to leave `idle` and begin listening.
  | { type: 'START' }
  // Sent by the UI to return to `idle`.
  | { type: 'STOP' }
  // Emitted by the `whisperListener` actor for each transcription result.
  | { type: 'VOICE_DETECTED'; transcript: string; confidence: number }
  | { type: 'WAKE_WORD_DETECTED'; command: string }
  | { type: 'COMMAND_PROCESSED'; description: string }
  | { type: 'SPEECH_DONE' }
  // Emitted by the `whisperListener` actor when the transcriber reports an error.
  | { type: 'ERROR'; error: string }
  // Manual retry request while in the `error` state.
  | { type: 'RETRY' };

// Machine
/**
 * State machine for the always-listening voice assistant.
 *
 * Flow: `idle` → `listening` → `checkingWakeWord` → `processing` → `speaking`
 * → `recovering` → `listening`. Failures route through `error`, which retries
 * automatically (up to 3 times) by going through `recovering`.
 *
 * The transcriber is owned by the `whisperListener` actor, which is invoked
 * only by the `listening` state. Leaving `listening` therefore stops it and
 * re-entering starts a fresh one; no explicit pause/resume is needed.
 */
export const voiceMachine = setup({
  types: {
    context: {} as VoiceContext,
    events: {} as VoiceEvent,
  },

  actors: {
    // Whisper + VAD listener
    whisperListener: fromCallback(({ sendBack, receive }) => {
      let transcriber: RealtimeTranscriber | null = null;
      // Set by the cleanup function so late async failures are not reported
      // after the actor has been torn down.
      let disposed = false;

      // Forwards a start-up failure to the machine as an ERROR event.
      const reportStartFailure = (error: unknown) => {
        if (disposed) {
          return;
        }
        sendBack({
          type: 'ERROR',
          error: error instanceof Error ? error.message : String(error),
        });
      };

      const stop = () => {
        transcriber?.stop();
        transcriber = null;
      };

      const start = async () => {
        // Never leave a previous transcriber running (e.g. RESUME without PAUSE).
        stop();

        const current = new RealtimeTranscriber(/* config */);
        transcriber = current;

        current.onTranscribe((result) => {
          sendBack({
            type: 'VOICE_DETECTED',
            transcript: result.text,
            confidence: result.confidence,
          });
        });

        current.onError((error) => {
          sendBack({ type: 'ERROR', error: error.message });
        });

        await current.start();
      };

      // `start` is async: surface a rejection as ERROR instead of leaving an
      // unhandled promise rejection.
      void start().catch(reportStartFailure);

      receive((event) => {
        if (event.type === 'PAUSE') {
          stop();
        }
        if (event.type === 'RESUME') {
          void start().catch(reportStartFailure);
        }
      });

      return () => {
        disposed = true;
        stop();
      };
    }),

    // Text-to-speech actor.
    // NOTE(review): assumes `Speech` is expo-speech, whose `speak()` returns
    // immediately and reports completion through callbacks — confirm. Awaiting
    // the call directly would resolve before the utterance finishes, so the
    // callbacks are adapted into a promise here.
    speechSynthesizer: fromPromise<void, { text: string; language: string }>(
      ({ input }) =>
        new Promise<void>((resolve, reject) => {
          Speech.speak(input.text, {
            language: input.language,
            rate: 0.85,
            pitch: 1.1,
            onDone: () => resolve(),
            // An interrupted utterance still ends the speaking phase.
            onStopped: () => resolve(),
            onError: reject,
          });
        })
    ),

    // Command processor
    commandProcessor: fromPromise<
      { description: string },
      { command: VoiceCommand }
    >(async ({ input }) => {
      const { command } = input;
      // Process command through use case
      const result = await processCommandUseCase.execute(command);
      return { description: result.description };
    }),
  },

  actions: {
    setTranscript: assign({
      transcript: ({ event }) =>
        event.type === 'VOICE_DETECTED' ? event.transcript : '',
    }),

    parseCommand: assign({
      command: ({ context }) => parseWakeWord(context.transcript),
    }),

    // Applies only to an explicit COMMAND_PROCESSED event. The `processing`
    // state reads the invoked actor's output directly instead.
    setDescription: assign({
      lastDescription: ({ event }) =>
        event.type === 'COMMAND_PROCESSED' ? event.description : null,
    }),

    setError: assign({
      error: ({ event }) => (event.type === 'ERROR' ? event.error : null),
    }),

    // Clears the error message only. The retry counter is reset separately
    // (see `resetRetries`); resetting it on every entry to `listening` would
    // make the `canRetry` limit unreachable.
    clearError: assign({
      error: null,
    }),

    resetRetries: assign({
      retryCount: 0,
    }),

    incrementRetry: assign({
      retryCount: ({ context }) => context.retryCount + 1,
    }),

    // These target the `whisperListener` child, which exists only while the
    // `listening` state is active; they must not be used from other states.
    pauseListener: sendTo('whisperListener', { type: 'PAUSE' }),
    resumeListener: sendTo('whisperListener', { type: 'RESUME' }),

    // Accessibility feedback
    announceListening: () => {
      Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Light);
    },
    announceError: () => {
      Haptics.notificationAsync(Haptics.NotificationFeedbackType.Error);
    },
  },

  guards: {
    hasWakeWord: ({ context }) => {
      return context.command !== null;
    },
    canRetry: ({ context }) => {
      return context.retryCount < 3;
    },
    isValidConfidence: ({ event }) => {
      return event.type === 'VOICE_DETECTED' && event.confidence > 0.7;
    },
  },
}).createMachine({
  id: 'voice',
  initial: 'idle',
  context: {
    transcript: '',
    command: null,
    lastDescription: null,
    error: null,
    retryCount: 0,
  },

  states: {
    idle: {
      on: {
        // A manual start begins a fresh session with a full retry budget.
        START: { target: 'listening', actions: 'resetRetries' },
      },
    },

    listening: {
      entry: ['clearError', 'announceListening'],
      invoke: {
        id: 'whisperListener',
        src: 'whisperListener',
      },
      on: {
        VOICE_DETECTED: [
          {
            guard: 'isValidConfidence',
            // A usable transcript proves the listener works again, so the
            // retry budget is restored.
            actions: ['resetRetries', 'setTranscript', 'parseCommand'],
            target: 'checkingWakeWord',
          },
        ],
        ERROR: {
          target: 'error',
          actions: 'setError',
        },
        STOP: {
          target: 'idle',
        },
      },
    },

    // Transient state: resolves immediately based on the parsed command.
    checkingWakeWord: {
      always: [
        {
          guard: 'hasWakeWord',
          target: 'processing',
        },
        {
          target: 'listening',
        },
      ],
    },

    processing: {
      // No pause action here: the listener was already stopped when
      // `listening` was exited, so there is no child left to send PAUSE to.
      invoke: {
        src: 'commandProcessor',
        // `hasWakeWord` guarantees a non-null command on the way in.
        input: ({ context }) => ({ command: context.command! }),
        onDone: {
          target: 'speaking',
          // The done event carries the actor's return value in `output`.
          actions: assign({
            lastDescription: ({ event }) => event.output.description,
          }),
        },
        onError: {
          target: 'error',
          actions: assign({
            error: ({ event }) =>
              event.error instanceof Error
                ? event.error.message
                : String(event.error),
          }),
        },
      },
      on: {
        STOP: {
          target: 'idle',
        },
      },
    },

    speaking: {
      invoke: {
        src: 'speechSynthesizer',
        input: ({ context }) => ({
          text: context.lastDescription || 'No hay descripción',
          language: 'es-ES',
        }),
        onDone: {
          target: 'recovering',
        },
        onError: {
          target: 'recovering',
        },
      },
      on: {
        STOP: {
          target: 'idle',
        },
      },
    },

    error: {
      // The error message is assigned by the transition that leads here.
      // Running `setError` on entry would overwrite it with null whenever the
      // triggering event is not an ERROR event (e.g. a failed invoke).
      entry: ['announceError'],
      on: {
        RETRY: [
          {
            guard: 'canRetry',
            target: 'recovering',
            actions: 'incrementRetry',
          },
          {
            target: 'idle',
          },
        ],
        STOP: {
          target: 'idle',
        },
      },
      after: {
        // Automatic retry; once the budget is spent the machine stays here
        // until RETRY or STOP is received.
        3000: [
          {
            guard: 'canRetry',
            target: 'recovering',
            actions: 'incrementRetry',
          },
        ],
      },
    },

    recovering: {
      // Short pause before listening again. Re-entering `listening` invokes
      // a fresh listener, so no RESUME message is required.
      after: {
        500: { target: 'listening' },
      },
      on: {
        STOP: {
          target: 'idle',
        },
      },
    },
  },
});

React Hook Integration

// presentation/hooks/useVoiceCommands.ts
import { useMachine } from '@xstate/react';
import { useCallback, useEffect, useRef } from 'react';
import { voiceMachine } from '../../machines/voiceMachine';

/** Options accepted by `useVoiceCommands`. All fields are optional. */
interface UseVoiceCommandsOptions {
  /** When true, the hook sends START to the machine from an effect. Defaults to false. */
  autoStart?: boolean;
  /** Called with the parsed command while the machine is in `processing`. */
  onCommand?: (command: VoiceCommand) => void;
  /** Called with `context.lastDescription` when it is non-empty. */
  onDescription?: (description: string) => void;
  /** Called with `context.error` when it is non-empty. */
  onError?: (error: string) => void;
}

/**
 * React binding for `voiceMachine`.
 *
 * Runs the machine for the lifetime of the component, forwards notable
 * changes to the optional callbacks, and exposes derived flags plus
 * `start` / `stop` / `retry` helpers.
 *
 * @param options Optional auto-start flag and notification callbacks.
 * @returns Current state value, derived booleans, selected context fields,
 *   action helpers, and the raw `send` / `snapshot` for advanced usage.
 */
export function useVoiceCommands(options: UseVoiceCommandsOptions = {}) {
  const { autoStart = false, onCommand, onDescription, onError } = options;

  const [snapshot, send] = useMachine(voiceMachine);

  // Callers usually pass inline callbacks whose identity changes on every
  // render. Keeping the latest ones in a ref lets the notification effects
  // depend only on machine state, so each change is reported exactly once
  // instead of on every re-render.
  const callbacksRef = useRef({ onCommand, onDescription, onError });
  useEffect(() => {
    callbacksRef.current = { onCommand, onDescription, onError };
  });

  // Auto-start if configured
  useEffect(() => {
    if (autoStart) {
      send({ type: 'START' });
    }
  }, [autoStart, send]);

  // Derived state
  const isListening = snapshot.matches('listening');
  const isProcessing = snapshot.matches('processing');
  const isSpeaking = snapshot.matches('speaking');
  const hasError = snapshot.matches('error');
  const isActive = !snapshot.matches('idle');

  const { transcript, command, lastDescription, error } = snapshot.context;

  // Callbacks on state changes
  useEffect(() => {
    if (isProcessing && command) {
      callbacksRef.current.onCommand?.(command);
    }
  }, [isProcessing, command]);

  useEffect(() => {
    if (lastDescription) {
      callbacksRef.current.onDescription?.(lastDescription);
    }
  }, [lastDescription]);

  useEffect(() => {
    if (error) {
      callbacksRef.current.onError?.(error);
    }
  }, [error]);

  // Actions
  const start = useCallback(() => send({ type: 'START' }), [send]);
  const stop = useCallback(() => send({ type: 'STOP' }), [send]);
  const retry = useCallback(() => send({ type: 'RETRY' }), [send]);

  return {
    // State
    state: snapshot.value,
    isListening,
    isProcessing,
    isSpeaking,
    hasError,
    isActive,

    // Context
    transcript,
    command,
    lastDescription,
    error,

    // Actions
    start,
    stop,
    retry,

    // For advanced usage
    send,
    snapshot,
  };
}

Component Usage

// presentation/components/organisms/VoiceCommandPanel.tsx
import { useVoiceCommands } from '../../hooks/useVoiceCommands';

/**
 * Panel showing the voice assistant status, the live transcript, the spoken
 * description, and error feedback, plus a single button that toggles the
 * assistant on and off.
 */
export function VoiceCommandPanel() {
  const {
    isListening,
    isProcessing,
    isSpeaking,
    hasError,
    isActive,
    transcript,
    lastDescription,
    error,
    start,
    stop,
    retry,
  } = useVoiceCommands({
    autoStart: true,
    onCommand: (cmd) => console.log('Command:', cmd),
    onDescription: (desc) => console.log('Description:', desc),
    onError: (err) => console.error('Error:', err),
  });

  return (
    <View style={styles.container}>
      <StatusIndicator
        status={
          hasError ? 'error' :
          isSpeaking ? 'speaking' :
          isProcessing ? 'processing' :
          isListening ? 'listening' :
          'idle'
        }
      />

      {isListening && (
        <Typography variant="caption">
          {transcript || 'Di "Iris" para comenzar...'}
        </Typography>
      )}

      {isSpeaking && lastDescription && (
        <Typography>{lastDescription}</Typography>
      )}

      {hasError && (
        <View>
          <Typography variant="error">{error}</Typography>
          <Button label="Reintentar" onPress={retry} />
        </View>
      )}

      {/*
        The toggle is keyed on `isActive`, not `isListening`: the machine only
        accepts START from `idle`, so while it is processing, speaking or in
        error the button must offer "stop" rather than a START that is ignored.
      */}
      <IconButton
        icon={isActive ? 'mic-off' : 'mic'}
        label={isActive ? 'Detener' : 'Activar'}
        onPress={isActive ? stop : start}
        accessibilityHint={
          isActive
            ? 'Detiene el reconocimiento de voz'
            : 'Activa el reconocimiento de voz'
        }
      />
    </View>
  );
}

Wake Word Parser

// domain/services/WakeWordParser.ts
const WAKE_WORD = 'iris';

// Matches the wake word only as a whole word, so that words merely containing
// it (e.g. "arcoiris") do not wake the assistant.
const WAKE_WORD_PATTERN = new RegExp(`\\b${WAKE_WORD}\\b`);

// Ordered list of intent matchers; the first pattern that matches wins.
const COMMAND_PATTERNS = [
  { pattern: /qu[ée]\s*(hay|ves|tiene).*frente/i, intent: 'DESCRIBE' },
  { pattern: /describ[ea]/i, intent: 'DESCRIBE' },
  // Covers "repite", "repita", "repetir", "repeti...".
  { pattern: /rep[ei]?t[aei]/i, intent: 'REPEAT' },
  { pattern: /ayuda/i, intent: 'HELP' },
  // Accepts both "adiós" and the unaccented "adios".
  { pattern: /adi[oó]s|chao/i, intent: 'GOODBYE' },
] as const;

/**
 * Extracts a voice command from a transcript.
 *
 * @param transcript Raw text produced by the speech recognizer.
 * @returns `null` when the wake word is absent. Otherwise a command whose
 *   `intent` is that of the first pattern matching the text after the wake
 *   word (confidence 1.0), or `'DESCRIBE'` with confidence 0.8 when no
 *   pattern matches. `text` is always the original, unmodified transcript.
 */
export function parseWakeWord(transcript: string): VoiceCommand | null {
  // NFC normalization makes accented letters single code points, so the
  // character classes in the patterns match decomposed input as well.
  const normalized = transcript.normalize('NFC').toLowerCase();

  // Check for wake word
  const wakeMatch = WAKE_WORD_PATTERN.exec(normalized);
  if (wakeMatch === null) {
    return null;
  }

  // Everything after the first wake word, including any later repetitions.
  const afterWakeWord = normalized.slice(wakeMatch.index + WAKE_WORD.length);

  // Match intent
  const matched = COMMAND_PATTERNS.find(({ pattern }) =>
    pattern.test(afterWakeWord)
  );

  // Default to describe if wake word detected but no specific command
  return {
    text: transcript,
    intent: matched ? matched.intent : 'DESCRIBE',
    confidence: matched ? 1.0 : 0.8,
    timestamp: Date.now(),
  };
}

Testing the Machine

// machines/voiceMachine.test.ts
import { createActor } from 'xstate';
import { voiceMachine } from './voiceMachine';

describe('voiceMachine', () => {
  // Creates and starts a fresh actor. Each test stops its actor in a
  // `finally` block so invoked listeners and delayed transitions do not
  // outlive the test that created them.
  const startVoiceActor = () => {
    const actor = createActor(voiceMachine);
    actor.start();
    return actor;
  };

  it('should start in idle state', () => {
    const actor = startVoiceActor();
    try {
      expect(actor.getSnapshot().value).toBe('idle');
    } finally {
      actor.stop();
    }
  });

  it('should transition to listening on START', () => {
    const actor = startVoiceActor();
    try {
      actor.send({ type: 'START' });

      expect(actor.getSnapshot().value).toBe('listening');
    } finally {
      actor.stop();
    }
  });

  it('should detect wake word and process command', () => {
    const actor = startVoiceActor();
    try {
      actor.send({ type: 'START' });

      actor.send({
        type: 'VOICE_DETECTED',
        transcript: 'Iris, describe lo que hay frente a mí',
        confidence: 0.9,
      });

      expect(actor.getSnapshot().context.command?.intent).toBe('DESCRIBE');
    } finally {
      actor.stop();
    }
  });
});

File Structure

voice/
├── domain/
│   ├── entities/
│   │   └── VoiceCommand.ts
│   └── services/
│       └── WakeWordParser.ts
├── application/
│   ├── use-cases/
│   │   └── ProcessCommand.ts
│   └── ports/
│       └── SpeechRecognizer.ts
├── infrastructure/
│   └── adapters/
│       ├── WhisperAdapter.ts
│       └── ExpoSpeechAdapter.ts
├── machines/
│   ├── voiceMachine.ts
│   └── voiceMachine.test.ts
└── presentation/
    ├── hooks/
    │   └── useVoiceCommands.ts
    └── components/
        └── organisms/
            └── VoiceCommandPanel.tsx

Best Practices

| Do | Don't |
| --- | --- |
| Keep machine logic pure | Put side effects in state transitions |
| Use actors for async/subscriptions | Mix promises in actions |
| Provide callbacks via hook options | Couple UI logic to machine |
| Test machine transitions | Only test UI components |
| Use guards for conditions | Put if/else in actions |
| Handle all error states | Ignore error recovery |
Related skills
Installs
1
First Seen
Apr 9, 2026