voice2subs/voice2subs.sh

#!/usr/bin/env bash

# errexit: abort the script as soon as a command exits non-zero
# (commands that are tested in a condition are exempt).
set -e

# Copyright (C) 2021 Tessa Nordgren
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Extra ffmpeg arguments used when extracting the audio track: an aresample
# filter (soxr resampler, shibata dither, 28-bit precision, mono output
# layout) followed by a 16000 Hz output rate.  Kept as one whitespace-separated
# string because it is expanded unquoted further down.
RESAMPLING_OPTS="-af aresample=resampler=soxr:dither_method=shibata:precision=28:out_channel_layout=mono"
RESAMPLING_OPTS+=" -ar 16000"

# Extra arguments for the gcloud speech recognition call, built up one flag
# at a time into a single whitespace-separated string.
ML_SPEECH_OPTS="--language-code=en"
ML_SPEECH_OPTS+=" --include-word-time-offsets"
ML_SPEECH_OPTS+=" --encoding=ogg-opus"
ML_SPEECH_OPTS+=" --sample-rate=16000"

#######################################
# Verify that gcloud, ffmpeg and python3 can be found by the shell.
# Arguments: none.
# Outputs:   one "missing <tool>, please install!" line on stderr for every
#            tool that is not available.
# Returns:   0 when all tools are available, 1 when at least one is missing.
#######################################
function check_deps() {
  local tool
  local status=0
  for tool in gcloud ffmpeg python3; do
    # `command -v` is a shell builtin, so the lookup itself does not depend
    # on the external `which` utility being installed.
    if ! command -v "${tool}" > /dev/null 2>&1; then
      echo "missing ${tool}, please install!" >&2
      # Keep going so every missing tool is reported in a single run.
      status=1
    fi
  done
  return "${status}"
}

#######################################
# Print the command-line help text.
# Arguments: none (reads $0 for the program name).
# Outputs:   four lines on stdout: the synopsis followed by one line per
#            argument/option.
#######################################
function usage() {
  # ${0##*/} strips the directory part without word-splitting, so a script
  # path containing spaces is handled correctly.
  printf 'Usage: %s [-h] [-n] video [video ...]\n' "${0##*/}"
  printf '\tvideo\tvideo file(s) to convert audio to subtitles.\n'
  printf '\t-h\tshows this usage message.\n'
  printf "\t-n\tdon't include audio in output.\n"
}

# Default: audio is kept in the output; -n switches this to 0.
INCLUDE_AUDIO=1

#######################################
# Parse the command line.
# Globals:
#   INCLUDE_AUDIO (written) - set to 0 and exported when -n is given.
#   ARGS (written)          - array holding the remaining (video) arguments.
# Arguments: the script's positional parameters ("$@").
# Outputs:   error messages and, on errors, the usage text on stderr;
#            the usage text on stdout for -h.
# Returns:   0 on success, 1 when no video file remains after the options.
# Exits:     0 after -h, 1 for a missing option argument, 2 for an
#            unknown option.
#######################################
function argparse() {
  # Keep the getopts state local so parsing always starts at the first
  # argument and nothing leaks into the caller's scope.
  local arg
  local optstring=":hn"
  local OPTIND=1
  local OPTARG
  while getopts "${optstring}" arg; do
    case "${arg}" in
      h)
        usage
        exit 0
        ;;
      n)
        export INCLUDE_AUDIO=0
        ;;
      :)
        echo -e "$0: must supply argument to -$OPTARG.\n" >&2
        exit 1
        ;;
      \?)
        echo -e "$0: invalid option: -${OPTARG}\n" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
  shift $((OPTIND - 1))
  # Check only after the options are consumed: a call such as `-n` alone
  # has arguments but still no video to process.
  if [[ ${#} -eq 0 ]]; then
    echo -e "$0: must supply one or more video files to process\n" >&2
    usage >&2
    return 1
  fi
  # Plain global assignment; bash cannot export arrays to child processes.
  ARGS=( "$@" )
}

#######################################
# Print a two-line heading for the file being processed.
# Arguments: $1 - file name to show.
# Outputs:   "Processing '<name>'..." on stdout, followed by an underline
#            of the same length: '-' under the fixed text, '=' under the
#            file name.
#######################################
function title() {
  local name="$1"
  local underline
  # Build a run of spaces exactly as long as the name, then turn it into
  # '=' characters (one per character of the name, none for an empty name).
  printf -v underline '%*s' "${#name}" ''
  underline="${underline// /=}"
  printf '%s\n' "Processing '${name}'..."
  printf '%s\n' "------------${underline}----"
}

# ---------------------------------------------------------------------------
# Main script: for every video given on the command line, extract its audio,
# run speech recognition on it, convert the result to subtitles and mux the
# subtitles into "<video>_with_subs.mkv".
# ---------------------------------------------------------------------------

# Under `set -e` a non-zero return from either call terminates the script.
check_deps
argparse "$@"

# The option strings are whitespace-separated argument lists.  Split them
# once into arrays so the words are never subject to pathname expansion.
read -r -a RESAMPLING_ARGS <<< "${RESAMPLING_OPTS}"
read -r -a ML_SPEECH_ARGS <<< "${ML_SPEECH_OPTS}"

# Start from an explicitly empty list so nothing inherited from the
# environment ends up on the ffmpeg command line; -n adds -an.
AUDIO_ARGS=()
if [[ "${INCLUDE_AUDIO}" == 0 ]]; then
  AUDIO_ARGS=(-an)
fi

# Intermediate files live in ./tmp and are reused when they already exist.
mkdir -p tmp
for FILE in "${ARGS[@]}"; do
  title "${FILE}"
  BASENAME="$(basename "${FILE%.*}")"
  AUDIO="tmp/${BASENAME}.opus"
  YML="tmp/${BASENAME}.yml"
  SRT="tmp/${BASENAME}.srt"
  OUT="${FILE%.*}_with_subs.mkv"

  if [[ ! -f "${AUDIO}" ]]; then
    echo "extracting audio..."
    # Write to a scratch name and rename only on success, so a failed or
    # interrupted run cannot leave a truncated file that the check above
    # would accept next time.  The scratch name keeps the .opus extension.
    AUDIO_PART="tmp/${BASENAME}.part.opus"
    ffmpeg -loglevel error -y -i "${FILE}" "${RESAMPLING_ARGS[@]}" "${AUDIO_PART}"
    mv -f -- "${AUDIO_PART}" "${AUDIO}"
  fi
  if [[ ! -f "${YML}" ]]; then
    echo "converting audio to text..."
    # Same scratch-then-rename scheme: the redirection creates the file
    # before gcloud has produced any output.
    YML_PART="${YML}.part"
    gcloud -q --format yaml ml speech recognize-long-running "${AUDIO}" "${ML_SPEECH_ARGS[@]}" > "${YML_PART}"
    mv -f -- "${YML_PART}" "${YML}"
  fi
  if [[ ! -f "${SRT}" ]]; then
    echo "converting google yaml data to subtitle data..."
    # NOTE(review): ./ml2srt.py is resolved against the current directory,
    # so the script has to be started from the directory that contains it.
    # Presumably it writes ${SRT} next to ${YML} - confirm against ml2srt.py.
    ./ml2srt.py "${YML}"
  fi

  ffmpeg -loglevel error -y -i "${FILE}" -i "${SRT}" "${AUDIO_ARGS[@]}" -c copy "${OUT}"
  echo -e "Finished, result is in: '${OUT}'\n"
done