|
@@ -36,6 +36,7 @@ struct unity_params {
|
|
/*normalize_scores*/ true,
|
|
/*normalize_scores*/ true,
|
|
/*mem_mb*/ 512,
|
|
/*mem_mb*/ 512,
|
|
};
|
|
};
|
|
|
|
+ int32_t max_audio_s = 30;
|
|
};
|
|
};
|
|
|
|
|
|
|
|
|
|
@@ -50,6 +51,7 @@ void unity_print_usage(int /*argc*/, char ** argv, const unity_params & params)
|
|
fprintf(stderr, " --text text output\n");
|
|
fprintf(stderr, " --text text output\n");
|
|
fprintf(stderr, " --beam-size beam size (default: %d)\n", params.opts.beam_size);
|
|
fprintf(stderr, " --beam-size beam size (default: %d)\n", params.opts.beam_size);
|
|
fprintf(stderr, " -M, --mem memory buffer, increase for long inputs (default: %d)\n", params.opts.mem_mb);
|
|
fprintf(stderr, " -M, --mem memory buffer, increase for long inputs (default: %d)\n", params.opts.mem_mb);
|
|
|
|
+ fprintf(stderr, " --max-audio max duration of audio in seconds (default: %d)\n", params.max_audio_s);
|
|
fprintf(stderr, "\n");
|
|
fprintf(stderr, "\n");
|
|
}
|
|
}
|
|
|
|
|
|
@@ -81,6 +83,8 @@ bool unity_params_parse(int argc, char ** argv, unity_params & params) {
|
|
params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
|
params.opts.beam_size = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
|
} else if (arg == "-M" || arg == "--mem") {
|
|
} else if (arg == "-M" || arg == "--mem") {
|
|
params.opts.mem_mb = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
|
params.opts.mem_mb = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
|
|
|
+ } else if (arg == "--max-audio") {
|
|
|
|
+ params.max_audio_s = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
|
} else {
|
|
} else {
|
|
params.files.push_back(std::string(arg));
|
|
params.files.push_back(std::string(arg));
|
|
}
|
|
}
|
|
@@ -189,8 +193,8 @@ int main(int argc, char ** argv) {
|
|
ggml_set_no_alloc(model.ctx, true);
|
|
ggml_set_no_alloc(model.ctx, true);
|
|
GGML_ASSERT(info.samplerate == 16000);
|
|
GGML_ASSERT(info.samplerate == 16000);
|
|
GGML_ASSERT(info.channels == 1);
|
|
GGML_ASSERT(info.channels == 1);
|
|
- // stop at 30s. Ideally we should chunk input audio, but this will prevent most obvious OOM.
|
|
|
|
- int n_frames = std::min(info.samplerate * 30, (int)info.frames);
|
|
|
|
|
|
+ // Truncate audio input. Ideally we should chunk it, but this will prevent most obvious OOM.
|
|
|
|
+ int n_frames = std::min(info.samplerate * params.max_audio_s, (int)info.frames);
|
|
ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_frames, info.channels);
|
|
ggml_tensor* seqs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_frames, info.channels);
|
|
ggml_allocr_alloc(fwd_alloc, seqs);
|
|
ggml_allocr_alloc(fwd_alloc, seqs);
|
|
|
|
|