# this is supposed to strip silences from raw audio files
# and split the voice when a long silence is encountered 
# (e.g. at the end of a sentence)

# works with linear 16-bit signed native byte-order audio

use stdio.h

use signal.h
use unistd.h
use fcntl.h
use sys/types.h
use sys/ioctl.h
use sys/soundcard.h

use alloc
use error
use m
	# just for "real" at the moment!
use env

# 16-bit signed sample, the unit read from the input and written to the output
typedef short i16
# size of one i16 sample in bytes, used for allocation and for fwrite
const int bytes_per_sample = 2
# largest positive sample value; main() multiplies the amplitude settings by
# this to turn them into i16 thresholds
num max_value = 32767

# samples per second
int sample_rate
# the maximum amplitude that still counts as silence; samples range from
# -1.0 to 1.0, so a threshold of 0.1 means silence is from -0.1 to 0.1
num silence_max_amp
num voice_min_amp
	# voice_min_amp is a condition for a sample to be saved,
	# but doesn't define the edges of the sample
# minimum seconds of silence for a break - e.g. 0.7
num break_min_sec
# how many seconds of silence to keep on either side of voice
# (should be less than 1/2 of break_min_sec) - e.g. 0.2
num padding_before_sec
num padding_after_sec
# close_file() removes an output file whose voice_samples count is below
# this duration (converted to samples)
num voice_min_sec
# when recording from the dsp, this much input is read and discarded first
num skip_initial_spike_sec

# printf-style template for output filenames; takes one integer argument
char *filename_template

# the settings above converted by main() into sample values and sample counts
i16 silence_min_value
i16 silence_max_value
i16 voice_low_value
i16 voice_high_value
int break_min_samples
int padding_before_samples
int padding_after_samples
int voice_min_samples
# running count kept during the voice state; reset at start_voice and
# compared with voice_min_samples in close_file()
int voice_samples
int skip_initial_spike_samples

# capacity of the filename buffer, not counting the terminating NUL
def max_filename_length 256

# number substituted into filename_template for the next output file
int file_count
char filename[max_filename_length + 1]
# the output file currently being written
FILE *file

# the sample most recently read by read_one_sample()
i16 v

# ring buffer of recent silent samples; buffer_size is its capacity in samples
int buffer_size
i16 *buf

# buffer_index is the slot the next pushed sample goes into;
# samples_in_buffer counts samples pushed since it was last reset or clamped
int buffer_index
int samples_in_buffer
# set once a sample in the current voice segment reaches the voice_min_amp
# thresholds; close_file() removes segments that never do
int loud_enough_to_be_voice

# scratch variable holding the result of the latest read or write call
int count
# Set to 1 by sigint_handler and polled by the read loops in main().
# It is written from a signal handler, so it is declared volatile sig_atomic_t:
# that is the only object type the C standard guarantees can be stored
# safely from a handler and observed by the interrupted code.
volatile sig_atomic_t interrupted = 0

# Signal handler: records that a signal arrived and does nothing else.
# main() installs it for SIGINT and SIGPIPE; the main loop notices the flag
# after its next read and shuts down through its normal end-of-input path.
#   sig - the signal number (unused)
sigint_handler(int sig)
	interrupted = 1

# Set by setup_dsp() when the device cannot deliver 16-bit little-endian
# samples; read_one_sample() then widens unsigned 8-bit input to 16 bits.
int resample_8bit = 0

# direct dsp support, because sox buffers too much in the pipe, we lose immediacy
#
# Replaces standard input with /dev/dsp and configures the device for mono
# capture at sample_rate, preferring signed 16-bit little-endian samples and
# falling back to unsigned 8-bit.  If the driver grants a different rate,
# sample_rate is updated to the granted value.  Finally stdin is given a
# small stdio buffer so that samples are delivered with little delay.
# Every failure is reported through serror()/error().
setup_dsp()
	int arg, status
	
	int fd = STDIN_FILENO

	# closing stdin first makes its descriptor number the lowest free one,
	# so the open() below takes its place and stdio's stdin reads the device
	close(fd)
	fd = open("/dev/dsp", O_RDONLY)
	if fd == -1
		# without this check the failure would only show up as a
		# misleading ioctl error on an invalid descriptor
		serror("could not open /dev/dsp")
	
	# this RESET apparently sets the "rec" mixer setting to 100%
	status = ioctl(fd, SNDCTL_DSP_RESET, 0)
	if status == -1
		serror("SNDCTL_DSP_RESET failed")

	# request mono; the driver writes back what it actually selected
	arg = 0
	status = ioctl(fd, SNDCTL_DSP_STEREO, &arg)
	if status == -1
		serror("SNDCTL_DSP_STEREO failed")
	if arg != 0
		serror("unable to set number of channels")
	
	arg = AFMT_S16_LE
	status = ioctl(fd, SNDCTL_DSP_SETFMT, &arg)
	if status == -1
		serror("SNDCTL_DSP_SETFMT ioctl failed")
	if arg != AFMT_S16_LE
		warn("unable to sample with 16bits LE - will resample")
		resample_8bit = 1
		arg = AFMT_U8
		status = ioctl(fd, SNDCTL_DSP_SETFMT, &arg)
		# the fallback request can fail outright as well, check it too
		if status == -1
			serror("SNDCTL_DSP_SETFMT ioctl failed")
		if arg != AFMT_U8
			error("unable to sample at 8bits")
	
	arg = sample_rate
	status = ioctl(fd, SNDCTL_DSP_SPEED, &arg)
	if status == -1
		serror("SNDCTL_DSP_SPEED ioctl failed")
	if arg != sample_rate
		# the driver picked another rate; adopt it so that the caller's
		# time-based settings are converted with the real rate
		warn("sample rate set to %d\n", arg)
		sample_rate = arg

	# make sure the delay is pretty small - 256 / 16000 of a sec
	char *stdin_buf = Malloc(256)
	if setvbuf(stdin, stdin_buf, _IOFBF, 256) != 0
		serror("setvbuf failed")

# non-zero: record from /dev/dsp via setup_dsp(); zero: read stdin as given
int use_dsp

# Entry point.  Takes no command-line arguments; all settings come from the
# environment, with these defaults:
#   use_dsp=1  sample_rate=8000  silence_max_amp=0.2  voice_min_amp=0.40
#   break_min_sec=0.8  padding_before_sec=0.2  padding_after_sec=0.6
#   voice_min_sec=0.2  skip_initial_spike_sec=0.25
#   filename_template=sample_%06d.raw  initial_file_count=1
#
# Reads samples one at a time and alternates between two states:
#   silence - buffer quiet samples until a non-silent one arrives, then open
#             a new output file and write the buffered lead-in padding
#   voice   - copy samples to the file until break_min_samples of continuous
#             silence have been seen, then write the trailing padding and
#             close the file
# close_file() prints the name of each file that is kept.
# Returns 0 on end of input or after SIGINT/SIGPIPE.
int main(int argc, char **argv)
	if argc != 1
		error( \
			"arguments should be passed in the environment:\n" \
			"  sample_rate silence_max_amp break_min_sec padding_before_sec padding_after_sec voice_min_sec\n" \
			"  voice_min_amp filename_template initial_file_count\n" \
			"  filename_template is printf-style, takes 1 integer argument, e.g. sentence_%%05d", argv[0])

	# handle sig_int
	signal(SIGINT, sigint_handler)
	signal(SIGPIPE, sigint_handler)
	
	use_dsp = atoi(Getenv("use_dsp", "1"))
	sample_rate = atoi(Getenv("sample_rate", "8000"))
	silence_max_amp = atof(Getenv("silence_max_amp", "0.2"))
	voice_min_amp = atof(Getenv("voice_min_amp", "0.40"))
	break_min_sec = atof(Getenv("break_min_sec", "0.8"))
	padding_before_sec = atof(Getenv("padding_before_sec", "0.2"))
	padding_after_sec = atof(Getenv("padding_after_sec", "0.6"))
	voice_min_sec = atof(Getenv("voice_min_sec", "0.2"))
	skip_initial_spike_sec = atof(Getenv("skip_initial_spike_sec", "0.25"))
	filename_template = Getenv("filename_template", "sample_%06d.raw")
	file_count = atoi(Getenv("initial_file_count", "1"))
	
	# open the device before converting any durations to sample counts:
	# setup_dsp() replaces sample_rate when the driver grants another rate
	if use_dsp
		setup_dsp()
	
	silence_min_value = (int)(max_value * - silence_max_amp)
	silence_max_value = (int)(max_value * silence_max_amp)
	voice_low_value = (int)(max_value * - voice_min_amp)
	voice_high_value = (int)(max_value * voice_min_amp)
	
	break_min_samples = (int)(sample_rate * break_min_sec)
	padding_before_samples = (int)(sample_rate * padding_before_sec)
	padding_after_samples = (int)(sample_rate * padding_after_sec)
	voice_min_samples = (int)(sample_rate * voice_min_sec)
	skip_initial_spike_samples = (int)(sample_rate * skip_initial_spike_sec)
	
	# the ring buffer holds break_min_samples samples, so it must not be
	# empty, and both paddings must fit inside it; the checks use sample
	# counts because those are what write_buffer() is actually given
	if break_min_samples <= 0
		error("break_min_sec must cover at least one sample")
	if padding_before_samples < 0 || padding_after_samples < 0
		error("padding_before_sec and padding_after_sec must not be negative")
	if padding_before_samples + padding_after_samples > break_min_samples
		error("padding_before_sec + padding_after_sec must be <= break_min_sec")
	
	buffer_size = break_min_samples
	buf = Malloc(buffer_size * bytes_per_sample)
	
	buffer_index = 0
	samples_in_buffer = 0

	if use_dsp
		# skip ~1/4 sec to avoid spike at start of recording
		# a bit busy but safe
		while (skip_initial_spike_samples > 0) {
			count = read_one_sample()
			--skip_initial_spike_samples
		}

start_silence	.
	#warning("start silence\n");

	# silence state: look for a non-silent amplitude, and keep up to
	# padding_before_samples samples before it, then output padding and switch to
	# voice state ; if get to eof, stop
silence	.
	count = read_one_sample()
	if count != 1 || interrupted
		# eof (or error)
		finished

	if v >= silence_min_value && v <= silence_max_value
		# silence - push to buffer
		push_to_buffer(v)
		
		# continue in silence state
		silence
	else
		# non-silence - open a new output file, flush buffer and output
		# this sample; then switch to voice state
		int rv = snprintf(filename, max_filename_length, filename_template, file_count)
		++ file_count
		if rv >= max_filename_length || rv < 0
			# can't create filename
			error("filename too long; maximum length is %d", max_filename_length)
		file = fopen(filename, "w")
		if file == NULL
			serror("could not create output file %s", filename)

		# flush buffer to new file: only the most recent
		# padding_before_samples of silence are kept as lead-in
		if samples_in_buffer > padding_before_samples
			samples_in_buffer = padding_before_samples
		write_buffer(samples_in_buffer, samples_in_buffer)

		# output this non-silent sample
		write_one_sample(v)

		# switch to voice state
		start_voice


	# voice state: keep track of how many samples since last non-silence.
	# Buffer silence until get to a non-silence or more than
	# break_min_samples (buffer needs to be break_min_samples in size).  If
	# got to an non-silence, output all the silence and the non-silent
	# value, and reset "samples since last non-silence" to 0; else if more
	# than break_min_samples of silence, output first padding_after_samples of
	# buffered silence, keep last padding_before_samples of buffered silence in
	# buffer, close file, output filename, and switch to silence state ; if
	# get to eof, output buffered silence (not more than padding_after_samples)
	# and stop
start_voice	.
	#warning("start voice\n");
	voice_samples = 0
	samples_in_buffer = 0
	loud_enough_to_be_voice = 0
	
voice	.
	count = read_one_sample()
	if count != 1 || interrupted
		# eof (or error): write the silence that directly follows the
		# voice, i.e. the oldest buffered samples, capped at
		# padding_after_samples - the same part the break case writes
		if samples_in_buffer > padding_after_samples
			write_buffer(samples_in_buffer, padding_after_samples)
		else
			write_buffer(samples_in_buffer, samples_in_buffer)
		close_file()
		finished
	
	if v >= voice_high_value || v <= voice_low_value
		loud_enough_to_be_voice = 1

	if v >= silence_min_value && v <= silence_max_value
		# silence - push to buffer
		push_to_buffer(v)
		
		if samples_in_buffer < break_min_samples
			# continue in voice state
			voice
		else
			# we have a break
			# output first padding_after_samples of buffer
			write_buffer(samples_in_buffer, padding_after_samples)
			# keep last padding_before_samples of buffered silence
			samples_in_buffer = padding_before_samples
			close_file()
			# switch to silence state
			start_silence
	else
		# non-silence - output all the silence and the non-silent
		# value, and reset "samples since last non-silence" to 0
		write_buffer(samples_in_buffer, samples_in_buffer)
		write_one_sample(v)
		voice_samples += samples_in_buffer + 1
		samples_in_buffer = 0
		
		# continue in voice state
		voice

finished	.
	Free(buf)

	return 0


# Write part of the ring buffer to the current output file.
#   start - how many samples back from buffer_index the region begins
#   size  - how many samples to write from there, oldest first
# The region may run past the end of buf, in which case it is written as
# two pieces.  A short write is reported through serror().
write_buffer(int start, int size)
	# nothing to do for an empty request
	if size <= 0
		return

	# turn the look-back distance into a position inside buf
	int pos = buffer_index - start
	if pos < 0
		pos += buffer_size

	# number of samples between pos and the end of buf
	int room = buffer_size - pos
	if size > room
		# the region wraps: write the piece up to the end of buf first
		count = fwrite(buf + pos, bytes_per_sample, room, file)
		#warning("wrote silence %d\n", room)
		if count != room
			serror("can't write to file")
		size -= room
		pos = 0

	# the remaining piece (or the whole region when it did not wrap)
	count = fwrite(buf + pos, bytes_per_sample, size, file)
	#warning("wrote silence %d\n", size)
	if count != size
		serror("can't write to file")

# Append a single sample to the current output file.
#   value - the sample to write
# A failed write is reported through serror().
write_one_sample(i16 value)
	count = fwrite(&value, bytes_per_sample, 1, file)
	if count == 1
		return
	serror("can't write to file")

# Store one sample in the ring buffer, overwriting the oldest slot once the
# buffer has wrapped, and count it in samples_in_buffer.
#   v - the sample to store
push_to_buffer(i16 v)
	# work out where the following sample will go, wrapping at the end
	int next = buffer_index + 1
	if next == buffer_size
		next = 0

	buf[buffer_index] = v
	buffer_index = next

	# note that samples_in_buffer may become more than buffer_size!
	samples_in_buffer += 1

# Close the current output file and decide whether to keep it.
# The file is kept, and its name printed on stdout and flushed, when the
# segment has at least voice_min_samples counted in voice_samples and some
# sample reached the voice amplitude thresholds.  Otherwise the file is
# removed and file_count is stepped back so its number is used again.
# A failure to close is reported through serror(), like a failed write.
close_file()
	# stdio may still hold unwritten samples, so a write error can surface
	# only here; ignoring it would announce a truncated file as good
	if fclose(file) != 0
		serror("can't close file %s", filename)
	#warning("close file")
	if voice_samples >= voice_min_samples && loud_enough_to_be_voice
		#warning("saved")
		printf("%s\n", filename)
		fflush(stdout)
	else
		#warning("removed")
		remove(filename)
		-- file_count

# Extremes seen so far in 8-bit mode, kept only for the diagnostic trace in
# read_one_sample(): cmin/cmax track the raw bytes, vmin/vmax the widened
# values.
int cmin = 1232, cmax = 0
int vmin = 999999, vmax = 0

# Read the next sample from stdin into the global v.
# In 8-bit mode (resample_8bit set) one unsigned byte is read and widened to
# a signed 16-bit value; otherwise one 16-bit sample is read directly.
# Returns the fread() item count: 1 on success, 0 on end of input or error.
#
# The 8-bit range trace goes to stderr, because stdout carries the names of
# the saved files and must not be mixed with diagnostics.
int read_one_sample()
	int count
	if resample_8bit
		unsigned char c
		count = fread(&c, 1, 1, stdin)
		if count == 1
			# recentre unsigned 0..255 on zero and scale up to 16 bits
			v = (c - 0x80) * 0x100
			if c < cmin
				cmin = c
				fprintf(stderr, "cmin %d\n", cmin)
			if c > cmax
				cmax = c
				fprintf(stderr, "cmax %d\n", cmax)
			if v < vmin
				vmin = v
				fprintf(stderr, "vmin %d\n", vmin)
			if v > vmax
				vmax = v
				fprintf(stderr, "vmax %d\n", vmax)
	else
		count = fread(&v, bytes_per_sample, 1, stdin)
	return count