# Strip silences from raw audio and split the voice into separate files
# whenever a long silence is encountered (e.g. at the end of a sentence).
#
# Input is read from stdin (or from /dev/dsp when use_dsp is non-zero) as
# linear 16-bit signed native byte-order samples.  Each saved segment is
# written to a file named from filename_template, and close_file() prints
# the name of every file that is kept on stdout.
#
# All parameters are passed in the environment - see main().

use stdio.h
use signal.h
use unistd.h
use fcntl.h
use sys/types.h
use sys/ioctl.h
use sys/soundcard.h

use alloc
use error
use m
# just for "real" at the moment!
use env

typedef short i16

const int bytes_per_sample = 2
num max_value = 32767

# samples per second
int sample_rate

# what is the maximum amplitude of silence; samples range from -1.0 to 1.0,
# so threshold of 0.1 means silence is from -0.1 to 0.1
num silence_max_amp

# voice_min_amp is a condition for a segment to be saved,
# but doesn't define the edges of the segment
num voice_min_amp

# minimum seconds of silence for a break - e.g. 0.7
num break_min_sec

# how many seconds of silence to keep on either side of voice.
# Both paddings are taken from the same break-sized ring buffer, so
# padding_before_sec + padding_after_sec must not exceed break_min_sec.
num padding_before_sec
num padding_after_sec

# a segment is only kept if it has at least this many seconds of voice
num voice_min_sec

# seconds of input discarded at startup
num skip_initial_spike_sec

# printf-style template taking one integer argument (the file number)
char *filename_template

# the thresholds above, converted to sample values
i16 silence_min_value
i16 silence_max_value
i16 voice_low_value
i16 voice_high_value

# the durations above, converted to sample counts
int break_min_samples
int padding_before_samples
int padding_after_samples
int voice_min_samples
int voice_samples
int skip_initial_spike_samples

def max_filename_length 256
int file_count
char filename[max_filename_length + 1]
FILE *file

# the most recently read sample - set by read_one_sample()
i16 v

# ring buffer of recent silent samples; buffer_index is the next write slot
int buffer_size
i16 *buf
int buffer_index
int samples_in_buffer
int loud_enough_to_be_voice

# scratch result of the last read / write call
int count

# set from the signal handler and polled by main() after every read, so it
# has to be a volatile sig_atomic_t to be safely shared with the handler
volatile sig_atomic_t interrupted = 0

# signal handler for SIGINT and SIGPIPE: just ask main() to finish up
sigint_handler(int sig)
	interrupted = 1

int resample_8bit = 0

# direct dsp support, because sox buffers too much in the pipe, we lose immediacy
#
# Replaces standard input with /dev/dsp, configured for mono capture at
# sample_rate.  Falls back to unsigned 8-bit capture (resample_8bit = 1) if
# the device will not do 16-bit little-endian.  sample_rate is updated to
# whatever rate the driver actually reports.
setup_dsp()
	int arg, status
	int fd = STDIN_FILENO
	# closing fd 0 first means the open() below is expected to return fd 0,
	# so the rest of the program can keep reading from stdin
	close(fd)
	fd = open("/dev/dsp", O_RDONLY)
	if fd == -1
		serror("could not open /dev/dsp")
	if fd != STDIN_FILENO
		error("/dev/dsp did not replace standard input (got fd %d)", fd)
	# this RESET apparently sets the "rec" mixer setting to 100%
	status = ioctl(fd, SNDCTL_DSP_RESET, 0)
	if status == -1
		serror("SNDCTL_DSP_RESET failed")
	# 0 requests mono; the driver writes back what it actually selected
	arg = 0
	status = ioctl(fd, SNDCTL_DSP_STEREO, &arg)
	if status == -1
		serror("SNDCTL_DSP_STEREO failed")
	if arg != 0
		serror("unable to set number of channels")
	arg = AFMT_S16_LE
	status = ioctl(fd, SNDCTL_DSP_SETFMT, &arg)
	if status == -1
		serror("SNDCTL_DSP_SETFMT ioctl failed")
	if arg != AFMT_S16_LE
		warn("unable to sample with 16bits LE - will resample")
		resample_8bit = 1
		arg = AFMT_U8
		status = ioctl(fd, SNDCTL_DSP_SETFMT, &arg)
		if status == -1
			serror("SNDCTL_DSP_SETFMT ioctl failed")
		if arg != AFMT_U8
			error("unable to sample at 8bits")
	arg = sample_rate
	status = ioctl(fd, SNDCTL_DSP_SPEED, &arg)
	if status == -1
		serror("SNDCTL_DSP_SPEED ioctl failed")
	if arg != sample_rate
		warn("sample rate set to %d\n", arg)
		sample_rate = arg
	# make sure the delay is pretty small - 256 / 16000 of a sec
	# (stdin_buf is handed over to stdio, so it is never freed)
	char *stdin_buf = Malloc(256)
	if setvbuf(stdin, stdin_buf, _IOFBF, 256) != 0
		serror("setvbuf failed")

int use_dsp

# Entry point.  Takes no command-line arguments; reads its settings from the
# environment, then runs a two-state machine (silence / voice) over the
# input, one sample at a time, until eof, a read error, SIGINT or SIGPIPE.
# Returns 0.
int main(int argc, char **argv)
	if argc != 1
		error( \
		 "arguments should be passed in the environment:\n" \
		 " sample_rate silence_max_amp break_min_sec padding_before_sec padding_after_sec voice_min_sec\n" \
		 " voice_min_amp filename_template initial_file_count\n" \
		 " filename_template is printf-style, takes 1 integer argument, e.g. sentence_%%05d", argv[0])

	# handle sig_int
	signal(SIGINT, sigint_handler)
	signal(SIGPIPE, sigint_handler)

	use_dsp = atoi(Getenv("use_dsp", "1"))
	sample_rate = atoi(Getenv("sample_rate", "8000"))
	silence_max_amp = atof(Getenv("silence_max_amp", "0.2"))
	voice_min_amp = atof(Getenv("voice_min_amp", "0.40"))
	break_min_sec = atof(Getenv("break_min_sec", "0.8"))
	padding_before_sec = atof(Getenv("padding_before_sec", "0.2"))
	padding_after_sec = atof(Getenv("padding_after_sec", "0.6"))
	voice_min_sec = atof(Getenv("voice_min_sec", "0.2"))
	skip_initial_spike_sec = atof(Getenv("skip_initial_spike_sec", "0.25"))
	filename_template = Getenv("filename_template", "sample_%06d.raw")
	file_count = atoi(Getenv("initial_file_count", "1"))

	silence_min_value = (int)(max_value * - silence_max_amp)
	silence_max_value = (int)(max_value * silence_max_amp)
	voice_low_value = (int)(max_value * - voice_min_amp)
	voice_high_value = (int)(max_value * voice_min_amp)
	break_min_samples = (int)(sample_rate * break_min_sec)
	padding_before_samples = (int)(sample_rate * padding_before_sec)
	padding_after_samples = (int)(sample_rate * padding_after_sec)
	voice_min_samples = (int)(sample_rate * voice_min_sec)
	skip_initial_spike_samples = (int)(sample_rate * skip_initial_spike_sec)

	# The ring buffer holds break_min_samples samples and both paddings are
	# cut out of it, so check the sizes in samples (the unit the buffer
	# arithmetic really uses), not seconds against samples.
	if break_min_samples < 1
		error("sample_rate * break_min_sec must be at least 1 sample")
	if padding_before_samples < 0 || padding_after_samples < 0
		error("padding_before_sec and padding_after_sec must not be negative")
	if padding_before_samples + padding_after_samples > break_min_samples
		error("padding_before_sec + padding_after_sec must be <= break_min_sec")

	buffer_size = break_min_samples
	buf = Malloc(buffer_size * bytes_per_sample)
	buffer_index = 0
	samples_in_buffer = 0

	if use_dsp
		setup_dsp()

	# skip ~1/4 sec to avoid spike at start of recording
	# a bit busy but safe
	while (skip_initial_spike_samples > 0) {
		count = read_one_sample()
		--skip_initial_spike_samples
	}

start_silence	.
	#warning("start silence\n");

	# silence state: look for a non-silent amplitude, and keep up to
	# padding_before_samples samples before it, then output padding and switch to
	# voice state ; if get to eof, stop

silence	.
	count = read_one_sample()
	if count != 1 || interrupted
		# eof (or error)
		finished
	if v >= silence_min_value && v <= silence_max_value
		# silence - push to buffer
		push_to_buffer(v)
		# continue in silence state
		silence
	else
		# non-silence - open a new output file, flush buffer and output
		# this sample; then switch to voice state
		int rv = snprintf(filename, max_filename_length, filename_template, file_count)
		++ file_count
		if rv >= max_filename_length || rv < 0
			# can't create filename
			error("filename too long; maximum length is %d", max_filename_length)
		file = fopen(filename, "w")
		if file == NULL
			serror("could not create output file %s", filename)
		# flush buffer to new file: only the most recent
		# padding_before_samples of the buffered silence are kept
		if samples_in_buffer > padding_before_samples
			samples_in_buffer = padding_before_samples
		write_buffer(samples_in_buffer, samples_in_buffer)
		# output this non-silent sample
		write_one_sample(v)
		# switch to voice state
		start_voice

	# voice state: keep track of how many samples since last non-silence.
	# Buffer silence until get to a non-silence or more than
	# break_min_samples (buffer needs to be break_min_samples in size).  If
	# got to a non-silence, output all the silence and the non-silent
	# value, and reset "samples since last non-silence" to 0; else if more
	# than break_min_samples of silence, output first padding_after_samples of
	# buffered silence, keep last padding_before_samples of buffered silence in
	# buffer, close file, output filename, and switch to silence state ; if
	# get to eof, output buffered silence (not more than padding_after_samples)
	# and stop

start_voice	.
	#warning("start voice\n");
	voice_samples = 0
	samples_in_buffer = 0
	loud_enough_to_be_voice = 0

voice	.
	count = read_one_sample()
	if count != 1 || interrupted
		# eof (or error): write the silence that directly follows the
		# voice, i.e. the FIRST samples of the buffered run, limited to
		# padding_after_samples - the same choice the break case makes
		int tail = samples_in_buffer
		if tail > padding_after_samples
			tail = padding_after_samples
		write_buffer(samples_in_buffer, tail)
		close_file()
		finished
	if v >= voice_high_value || v <= voice_low_value
		loud_enough_to_be_voice = 1
	if v >= silence_min_value && v <= silence_max_value
		# silence - push to buffer
		push_to_buffer(v)
		if samples_in_buffer < break_min_samples
			# continue in voice state
			voice
		else
			# we have a break
			# output first padding_after_samples of buffer
			write_buffer(samples_in_buffer, padding_after_samples)
			# keep last padding_before_samples of buffered silence
			samples_in_buffer = padding_before_samples
			close_file()
			# switch to silence state
			start_silence
	else
		# non-silence - output all the silence and the non-silent
		# value, and reset "samples since last non-silence" to 0
		write_buffer(samples_in_buffer, samples_in_buffer)
		write_one_sample(v)
		voice_samples += samples_in_buffer + 1
		samples_in_buffer = 0
		# continue in voice state
		voice

finished	.
	Free(buf)
	return 0

# Write `size` samples from the ring buffer to the current output file,
# beginning `start` samples back in time from buffer_index.  `start` must
# not exceed buffer_size.  The copy is split in two when it wraps past the
# end of buf.  Reports a failed write through serror().
write_buffer(int start, int size)
	if size <= 0
		return
	# start is an offset back in time from the buffer index
	start = buffer_index - start
	if start < 0
		start += buffer_size
	int end_of_buf = buffer_size - start
	if size > end_of_buf
		# wraps: first write the part up to the end of the buffer
		count = fwrite(buf + start, bytes_per_sample, end_of_buf, file)
		#warning("wrote silence %d\n", end_of_buf)
		if count != end_of_buf
			serror("can't write to file")
		size -= end_of_buf
		start = 0
	if size > 0
		count = fwrite(buf + start, bytes_per_sample, size, file)
		#warning("wrote silence %d\n", size)
		if count != size
			serror("can't write to file")

# Write a single sample to the current output file.
write_one_sample(i16 value)
	count = fwrite(&value, bytes_per_sample, 1, file)
	if count != 1
		serror("can't write to file")

# Append a sample to the ring buffer, overwriting the oldest slot once the
# buffer has wrapped.
push_to_buffer(i16 v)
	buf[buffer_index] = v
	++buffer_index
	if buffer_index == buffer_size
		buffer_index = 0
	++ samples_in_buffer
	# note that samples_in_buffer may become more than buffer_size!
# Close the current output file, then either report it or discard it.
# A file is kept when it holds at least voice_min_samples of voice and some
# sample reached the voice_min_amp threshold: its name is printed on stdout.
# Otherwise the file is removed and its number is given back for reuse.
close_file()
	# fclose flushes the stdio buffer, so a failure here means samples
	# were lost - don't go on to report the file as saved
	if fclose(file) != 0
		serror("can't write to file")
	file = NULL
	#warning("close file")
	if voice_samples >= voice_min_samples && loud_enough_to_be_voice
		#warning("saved")
		printf("%s\n", filename)
		fflush(stdout)
	else
		#warning("removed")
		remove(filename)
		-- file_count

# running extremes seen on the 8-bit path - diagnostics only
int cmin = 1232, cmax = 0
int vmin = 999999, vmax = 0

# Read the next sample from stdin into the global v.
# In 8-bit mode (resample_8bit) one unsigned byte is read and rescaled to
# the signed 16-bit range; otherwise one 16-bit sample is read directly.
# Returns the fread() item count: 1 on success, 0 on eof or error.
int read_one_sample()
	int n_read
	if resample_8bit
		unsigned char c
		n_read = fread(&c, 1, 1, stdin)
		#printf("c: %d\n", c)
		if n_read == 1
			# unsigned 0..255 centred on 0x80 -> signed, scaled by 256
			v = (c - 0x80) * 0x100
			#printf("v: %d\n", v)
			# the range statistics go to stderr: stdout carries the
			# list of saved filenames and must not be polluted
			if c < cmin
				cmin = c
				fprintf(stderr, "cmin %d\n", cmin)
			if c > cmax
				cmax = c
				fprintf(stderr, "cmax %d\n", cmax)
			if v < vmin
				vmin = v
				fprintf(stderr, "vmin %d\n", vmin)
			if v > vmax
				vmax = v
				fprintf(stderr, "vmax %d\n", vmax)
	else
		n_read = fread(&v, bytes_per_sample, 1, stdin)
	return n_read