import { useCallback, useEffect, useRef, useState } from 'react'

// Both thresholds are compared against the level computed in the hook's voice-activity loop:
// 20 * log10(average / 255), where `average` is the mean of the analyser's byte frequency data.

// Levels below this value count as silence (only once speech has been detected).
const SILENCE_THRESHOLD = -40 // dB
// Levels above this value mark that speech has been detected in the current recording.
const SPEECH_THRESHOLD = -26 // dB
// Default for the hook's `silenceDuration` parameter: how long the level must stay below
// SILENCE_THRESHOLD before the recording is stopped automatically.
const SILENCE_DURATION = 2500 // ms

/**
 * Uploads a recorded audio blob to `/api/transcribe` and forwards the returned
 * transcription to `setInput`.
 *
 * The request is a POST with the blob as the raw body and a fixed
 * `Content-Type: audio/wav` header. `setInput` is only called when the server
 * answers with a successful (2xx) status and a JSON body whose `transcription`
 * field is a non-empty string.
 *
 * This function never rejects: network failures, non-2xx responses and
 * malformed JSON are logged with `console.error` and otherwise ignored.
 *
 * @param audioBlob - The recorded audio to transcribe.
 * @param setInput - Callback that receives the transcribed text.
 */
const transcribeAudio = async (audioBlob: Blob, setInput: (input: string) => void): Promise<void> => {
  try {
    const response = await fetch('/api/transcribe', {
      method: 'POST',
      headers: {
        'Content-Type': 'audio/wav',
      },
      body: audioBlob,
    })

    // An error response may still carry a JSON body; never treat it as a transcription.
    if (!response.ok) {
      console.error(
        'Error transcribing audio:',
        new Error(`Transcription request failed with status ${response.status}`),
      )
      return
    }

    const payload: unknown = await response.json()
    const transcription =
      typeof payload === 'object' && payload !== null
        ? (payload as { transcription?: unknown }).transcription
        : undefined

    // Only forward real text; arrays or other values with a `length` must not reach `setInput`.
    if (typeof transcription === 'string' && transcription.length > 0) {
      setInput(transcription)
    }
  } catch (error) {
    console.error('Error transcribing audio:', error)
  }
}

/**
 * Formats a duration given in seconds as a zero-padded `MM:SS` string.
 *
 * Fractional seconds are truncated, and negative or non-finite inputs are
 * treated as zero, so the result always has the shape `MM:SS`. Minutes are not
 * wrapped into hours (3600 seconds yields `60:00`).
 *
 * @param durationInSeconds - Elapsed time in seconds.
 * @returns The formatted duration, e.g. `01:05` for 65 seconds.
 */
const formatDuration = (durationInSeconds: number): string => {
  // Normalise to a non-negative whole number so neither field can render as "5.5", "-5" or "NaN".
  const totalSeconds = Number.isFinite(durationInSeconds) ? Math.max(0, Math.floor(durationInSeconds)) : 0
  const pad = (value: number): string => value.toString().padStart(2, '0')
  return `${pad(Math.floor(totalSeconds / 60))}:${pad(totalSeconds % 60)}`
}

/**
 * React hook that records microphone audio, stops automatically after a period
 * of silence following detected speech, and sends the recording to
 * `transcribeAudio`, which forwards any transcription to `setInput`.
 *
 * While recording is enabled (toggled on via `toggleRecording`), the hook starts a
 * new recording whenever it is idle and `isPersonaTyping` is false.
 *
 * @param setInput - Receives the transcribed text of each finished recording.
 * @param isPersonaTyping - While true, no new recording is started.
 * @param silenceDuration - Milliseconds of continuous silence (after speech was
 *   detected) that end a recording. Defaults to `SILENCE_DURATION`.
 * @returns `duration` (seconds since the recording started), `formatDuration`,
 *   `isListening`, `isProcessing` (microphone is being acquired or a recording is
 *   being transcribed) and `toggleRecording`.
 */
const useWhisperTranscription = (
  setInput: (input: string) => void,
  isPersonaTyping: boolean,
  silenceDuration = SILENCE_DURATION,
) => {
  const [isListening, setIsListening] = useState<boolean>(false)
  const [isProcessing, setIsProcessing] = useState<boolean>(false)
  const [duration, setDuration] = useState<number>(0)
  const [isEnabled, setIsEnabled] = useState<boolean>(false)
  // `ReturnType<typeof setInterval>` is correct for both browser and Node typings.
  const durationIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null)
  const audioChunks = useRef<Blob[]>([])
  const mediaRecorderRef = useRef<MediaRecorder | null>(null)
  const mediaStreamRef = useRef<MediaStream | null>(null)
  const audioContextRef = useRef<AudioContext | null>(null)
  const analyserRef = useRef<AnalyserNode | null>(null)
  const silenceStartRef = useRef<number | null>(null)
  const speechDetectedRef = useRef<boolean>(false)
  const rafRef = useRef<number | null>(null)
  const isMountedRef = useRef<boolean>(true)

  // Releases everything a recording session holds: the level-polling frame, the duration
  // timer, the recorder, the audio context and the microphone tracks. Every ref is cleared
  // as it is released, so calling this more than once is safe.
  const releaseAudioResources = useCallback(() => {
    if (rafRef.current !== null) {
      cancelAnimationFrame(rafRef.current)
      rafRef.current = null
    }

    if (durationIntervalRef.current !== null) {
      clearInterval(durationIntervalRef.current)
      durationIntervalRef.current = null
    }

    const recorder = mediaRecorderRef.current
    mediaRecorderRef.current = null
    if (recorder && recorder.state !== 'inactive') {
      recorder.stop()
    }

    const audioContext = audioContextRef.current
    audioContextRef.current = null
    analyserRef.current = null
    // Closing an already-closed context rejects, so check the state first.
    if (audioContext && audioContext.state !== 'closed') {
      audioContext.close().catch((error: unknown) => {
        console.error('Error closing audio context:', error)
      })
    }

    // Stop the tracks so the browser actually releases the microphone.
    const stream = mediaStreamRef.current
    mediaStreamRef.current = null
    stream?.getTracks().forEach((track) => track.stop())

    silenceStartRef.current = null
    speechDetectedRef.current = false
  }, [])

  const stopListening = useCallback(async () => {
    const recorder = mediaRecorderRef.current

    if (!recorder || recorder.state !== 'recording') {
      releaseAudioResources()
      return
    }

    setIsListening(false)
    setIsProcessing(true)
    setDuration(0)

    // The final `dataavailable` event is delivered asynchronously after `stop()`, followed by
    // `stop`. Waiting for `stop` ensures the last chunk is part of the uploaded recording.
    const recorderStopped = new Promise<void>((resolve) => {
      recorder.addEventListener('stop', () => resolve(), { once: true })
    })
    recorder.stop()

    // Tear down synchronously, before any await, so the level-polling loop cannot call
    // `stopListening` again while the transcription request is in flight.
    releaseAudioResources()

    await recorderStopped

    // NOTE(review): the blob is labelled 'audio/wav' although MediaRecorder output is normally
    // a compressed container (see `recorder.mimeType`); kept as-is because the upload header in
    // `transcribeAudio` uses the same label — confirm what the server expects.
    const audioBlob = new Blob(audioChunks.current, { type: 'audio/wav' })
    audioChunks.current = []

    try {
      // Nothing was captured; skip the pointless upload.
      if (audioBlob.size > 0) {
        await transcribeAudio(audioBlob, setInput)
      }
    } finally {
      setIsProcessing(false)
    }
  }, [setInput, releaseAudioResources])

  const detectVoiceActivity = useCallback(() => {
    const analyser = analyserRef.current
    if (!analyser) return

    const bufferLength = analyser.frequencyBinCount
    const dataArray = new Uint8Array(bufferLength)

    // Mean of the byte frequency data expressed in dB relative to the maximum byte value.
    // A completely silent buffer yields -Infinity, which compares below both thresholds.
    const getAudioLevel = (): number => {
      analyser.getByteFrequencyData(dataArray)
      const average = dataArray.reduce((sum, value) => sum + value, 0) / bufferLength
      return 20 * Math.log10(average / 255)
    }

    const checkAudioLevel = () => {
      const db = getAudioLevel()

      if (db > SPEECH_THRESHOLD) {
        speechDetectedRef.current = true
      }

      // if speechDetectedRef.current is false, that means nothing was said yet, so we shouldn't be detecting silence in that case
      if (db < SILENCE_THRESHOLD && speechDetectedRef.current) {
        if (silenceStartRef.current === null) {
          silenceStartRef.current = Date.now()
        } else if (Date.now() - silenceStartRef.current > silenceDuration) {
          void stopListening()
          return
        }
      } else {
        silenceStartRef.current = null
      }

      rafRef.current = requestAnimationFrame(checkAudioLevel)
    }

    checkAudioLevel()
    // `silenceDuration` is read inside the loop, so it must be a dependency to avoid a stale value.
  }, [stopListening, silenceDuration])

  const startListening = useCallback(async () => {
    if (isPersonaTyping || isListening || isProcessing) return

    setIsProcessing(true)
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true })

      // The component may have unmounted while the permission prompt was open.
      if (!isMountedRef.current) {
        stream.getTracks().forEach((track) => track.stop())
        return
      }
      mediaStreamRef.current = stream

      const mediaRecorder = new MediaRecorder(stream)
      mediaRecorderRef.current = mediaRecorder

      const audioContext = new AudioContext()
      audioContextRef.current = audioContext
      const analyser = audioContext.createAnalyser()
      analyserRef.current = analyser
      audioContext.createMediaStreamSource(stream).connect(analyser)

      mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          audioChunks.current.push(event.data)
        }
      }

      mediaRecorder.onstart = () => {
        // Ignore a late `start` event for a session that was already stopped or unmounted;
        // otherwise the duration timer started below would never be cleared.
        if (!isMountedRef.current || mediaRecorderRef.current !== mediaRecorder) return

        audioChunks.current = []
        setIsListening(true)
        setIsProcessing(false)
        detectVoiceActivity()
        setDuration(0)

        if (durationIntervalRef.current !== null) {
          clearInterval(durationIntervalRef.current)
        }

        durationIntervalRef.current = setInterval(() => {
          setDuration((prev) => prev + 1)
        }, 1000)
      }

      // Deliver a chunk every second.
      mediaRecorder.start(1000)
    } catch (error) {
      console.error('Error accessing microphone:', error)
      // Release anything acquired before the failure (e.g. the stream when MediaRecorder throws).
      releaseAudioResources()
      // Disable auto-restart: otherwise resetting `isProcessing` re-triggers the effect below
      // and the failing microphone request is retried in an endless loop.
      setIsEnabled(false)
      setIsProcessing(false)
    }
  }, [isPersonaTyping, isListening, isProcessing, detectVoiceActivity, releaseAudioResources])

  // Track mount state and release the microphone, recorder, audio context and timers on unmount.
  useEffect(() => {
    isMountedRef.current = true
    return () => {
      isMountedRef.current = false
      releaseAudioResources()
    }
  }, [releaseAudioResources])

  // While enabled, (re)start listening whenever the hook is idle and the persona is not typing.
  // `startListening` changes identity when `isListening`/`isProcessing` change, which re-runs this.
  useEffect(() => {
    if (isEnabled && !isPersonaTyping) {
      void startListening()
    }
  }, [isEnabled, isPersonaTyping, startListening])

  const toggleRecording = useCallback(() => {
    if (isListening) {
      void stopListening()
      setIsEnabled(false)
    } else {
      setIsEnabled(true)
      void startListening()
    }
  }, [isListening, stopListening, startListening])

  return { duration, formatDuration, isListening, isProcessing, toggleRecording }
}

export default useWhisperTranscription
