"""
Script for extracting DeepSpeech features from audio file.
"""
|
||
|
|
||
|
import os
|
||
|
import argparse
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from deepspeech_store import get_deepspeech_model_file
|
||
|
from deepspeech_features import conv_audios_to_deepspeech
|
||
|
|
||
|
|
||
|
def parse_args(argv=None):
    """
    Create python script parameters.

    Parameters
    ----------
    argv : list of str, default None
        Command-line tokens to parse, without the program name. When None,
        argparse falls back to `sys.argv[1:]`, which is what the script
        entry point relies on.

    Returns
    -------
    argparse.Namespace
        Parsed arguments with the attributes `input`, `output`, `deepspeech`
        and `metainfo`. Only `--input` is required; the other options are
        None when omitted.
    """
    parser = argparse.ArgumentParser(
        description="Extract DeepSpeech features from audio file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to input audio file or directory")
    parser.add_argument(
        "--output",
        type=str,
        help="path to output file with DeepSpeech features")
    parser.add_argument(
        "--deepspeech",
        type=str,
        help="path to DeepSpeech 0.1.0 frozen model")
    parser.add_argument(
        "--metainfo",
        type=str,
        help="path to file with meta-information")

    return parser.parse_args(argv)
|
||
|
|
||
|
|
||
|
def extract_features(in_audios,
                     out_files,
                     deepspeech_pb_path,
                     metainfo_file_path=None):
    """
    Extract DeepSpeech features from audio files.

    Fills in missing output paths, optionally loads per-file counts from a
    meta-information file, and hands everything to
    `conv_audios_to_deepspeech`, which performs the actual conversion.

    Parameters
    ----------
    in_audios : list of str
        Paths to input audio files.
    out_files : list of str
        Paths to output files with DeepSpeech features. Falsy entries (None
        or an empty string) are replaced IN PLACE with the matching input
        path whose extension is swapped for ".npy".
    deepspeech_pb_path : str
        Path to DeepSpeech 0.1.0 frozen model.
    metainfo_file_path : str, default None
        Path to file with meta-information: a tab-separated table with the
        columns "Id", "File" and "Count". The "Count" column is forwarded as
        `num_frames_info`, one value per input file in row order. When None,
        a None is forwarded for every input file.

    Raises
    ------
    ValueError
        If the meta-information file does not contain exactly one row per
        input audio file.
    """
    if metainfo_file_path is None:
        num_frames_info = [None] * len(in_audios)
    else:
        # Builtin int/str are used instead of the np.int/np.unicode aliases,
        # which were deprecated in NumPy 1.20 and removed in NumPy 1.24.
        metainfo_df = pd.read_csv(
            metainfo_file_path,
            sep="\t",
            index_col=False,
            dtype={"Id": int, "File": str, "Count": int})
        num_frames_info = metainfo_df["Count"].values
        # Explicit check rather than `assert`, which is stripped under -O.
        if len(num_frames_info) != len(in_audios):
            raise ValueError(
                "Meta-information file {} has {} rows, but {} input audio "
                "files were given".format(
                    metainfo_file_path, len(num_frames_info), len(in_audios)))

    # Default each missing output path to "<input stem>.npy" next to the input.
    for i, in_audio in enumerate(in_audios):
        if not out_files[i]:
            file_stem, _ = os.path.splitext(in_audio)
            out_files[i] = file_stem + ".npy"

    conv_audios_to_deepspeech(
        audios=in_audios,
        out_files=out_files,
        num_frames_info=num_frames_info,
        deepspeech_pb_path=deepspeech_pb_path)
|
||
|
|
||
|
|
||
|
def main():
    """
    Main body of script.

    Parses the command line, resolves the DeepSpeech model file and runs
    feature extraction on either a single audio file or every ".wav" file
    directly inside a directory (non-recursive, processed in sorted order).

    In directory mode `--output` is not used: each output path is derived
    from the corresponding input file name by `extract_features`.

    Raises
    ------
    FileNotFoundError
        If the `--input` path does not exist.
    """
    args = parse_args()

    in_audio = os.path.expanduser(args.input)
    if not os.path.exists(in_audio):
        raise FileNotFoundError(
            "Input file/directory doesn't exist: {}".format(in_audio))

    # Honour --deepspeech when it is given; otherwise use the conventional
    # local model location. Previously a leftover debug override always forced
    # this default and silently ignored the command-line option.
    default_pb_path = "~/.tensorflow/models/deepspeech-0_1_0-b90017e8.pb"
    deepspeech_pb_path = os.path.expanduser(args.deepspeech or default_pb_path)
    if not os.path.exists(deepspeech_pb_path):
        # NOTE(review): presumably this fetches/locates the stock model;
        # confirm against deepspeech_store.
        deepspeech_pb_path = get_deepspeech_model_file()

    if os.path.isfile(in_audio):
        audio_file_paths = [in_audio]
        out_file_paths = [args.output]
    else:
        # Collect regular files with a ".wav" extension (case-insensitive).
        audio_file_paths = []
        for file_name in os.listdir(in_audio):
            candidate_path = os.path.join(in_audio, file_name)
            if not os.path.isfile(candidate_path):
                continue
            _, file_ext = os.path.splitext(file_name)
            if file_ext.lower() == ".wav":
                audio_file_paths.append(candidate_path)
        audio_file_paths.sort()
        # Empty strings make extract_features derive "<stem>.npy" names.
        out_file_paths = [""] * len(audio_file_paths)

    extract_features(
        in_audios=audio_file_paths,
        out_files=out_file_paths,
        deepspeech_pb_path=deepspeech_pb_path,
        metainfo_file_path=args.metainfo)
|
||
|
|
||
|
|
||
|
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||
|
|