#!/usr/bin/env python3
# rsc_to_tsv.py - Convert Rec format to TSV/CSV format and vice versa

import sys
import argparse
import re
import csv
from typing import List, Dict, Union, Tuple, Optional
from io import TextIOWrapper

# The Rec format is similar to Debian's control file format or RFC 822, but not exactly the same.
# - Each record is separated by a blank line.
# - Each record consists of key-value pairs, separated by a colon.
# - Values may span multiple lines, if the continuation line starts with a space.
# - Values may contain newlines, if the continuation line starts with a space.
# - Values may contain leading dots, if the continuation line starts with a space and a dot.
# - The leading dot is for readability, and is not part of the value.

# Example of a file having two records:

# Card 1:
# Front: What is a regular expression in Python?
# Back: A sequence of characters defining a search pattern to match strings or manipulate text. Python has the re module for regular expressions.
# Hints: Pattern matching, search, text manipulation
# Extra: Basic pattern: \d matches any digit. Example: re.search(r'\d', 'abc123')
# 
# Card 2:
# Front: What is a positive lookahead assertion?
# Back: A lookahead assertion that checks if a pattern is followed by another pattern, without consuming characters. Syntax: (?=pattern)
# Hints: Asserts pattern presence, non-consuming
# Extra: Example: re.search(r'\d(?=\w)', '123abc')

# This is a natural format for storing flashcards, but we need to convert it to TSV/CSV format for importing into Anki.
# It is easy to get GPT to generate a file in this format, but it is not easy to get GPT to generate a file in TSV/CSV format.
# We could also generate flashcard data in markdown table format, but that is not as easy to edit as Rec format.

# TODO Enhance this conversion tool to also handle markdown table format.
# TODO Use a streaming approach.
# TODO Check out rec2csv and GNU recutils.
# TODO I also have older scripts in Perl: recs2tsv and tsv2recs, which probably work and can do streaming.

def read_records(input_stream: TextIOWrapper, strip_dots: bool = False) -> List[Dict[str, str]]:
	"""Parse Rec-format text into a list of records.

	Records are separated by blank lines. Each record is made of
	``key: value`` lines; a line that starts with whitespace continues the
	value of the preceding key, and is joined to it with a newline.

	Args:
		input_stream: iterable of text lines (e.g. an open file or sys.stdin).
		strip_dots: if true, remove one leading dot from each continuation
			line after its leading whitespace character has been removed.

	Returns:
		One dict per record, mapping each key to its (possibly multi-line) value.

	Raises:
		ValueError: if a continuation line appears before any key in the
			current record, or if a non-continuation line has no colon.
	"""
	records = []
	current_record = {}
	key = None

	for line in input_stream:
		# Trailing whitespace (including the newline) is never significant.
		line = line.rstrip()

		if not line:
			# A blank line ends the current record, if one has been started.
			if current_record:
				records.append(current_record)
				current_record = {}
			key = None
			continue

		if re.match(r'\s', line):
			if key is None:
				raise ValueError('Unexpected input: continuation line without a key')
			# Drop the single whitespace character that marks the continuation.
			line = line[1:]
			if strip_dots and line.startswith('.'):
				line = line[1:]
			# Append the continuation text itself, not just the newline.
			current_record[key] += '\n' + line
		else:
			name, colon, value = line.partition(':')
			if not colon:
				raise ValueError(f'Unexpected input: line without a colon: {line!r}')
			key = name.strip()
			# Remove only the one separator character after the colon.
			if re.match(r'\s', value):
				value = value[1:]
			current_record[key] = value

	# The last record may not be followed by a blank line.
	if current_record:
		records.append(current_record)

	return records

def read_csv(input_stream: TextIOWrapper, separator: str) -> List[Dict[str, str]]:
	"""Read delimited text whose first row is a header into a list of row dicts.

	Args:
		input_stream: text stream to read from.
		separator: the field delimiter handed to csv.DictReader.

	Returns:
		One dict per data row, keyed by the header names.
	"""
	return list(csv.DictReader(input_stream, delimiter=separator))

def write_csv(records: List[Dict[str, str]], fields: List[str], output_stream: TextIOWrapper, delimiter: str) -> None:
	"""Write records as delimited text: a header row, then one row per record.

	Args:
		records: the row dicts to write.
		fields: column names, in output order; also used as the header row.
		output_stream: text stream to write to.
		delimiter: the field delimiter handed to csv.DictWriter.
	"""
	dict_writer = csv.DictWriter(output_stream, fieldnames=fields, delimiter=delimiter)
	dict_writer.writeheader()
	for row in records:
		dict_writer.writerow(row)

def write_records(records: List[Dict[str, str]], output_stream: TextIOWrapper) -> None:
	"""Write records in Rec format, each followed by a blank line.

	Every field is written as ``key: value``. Newlines inside a value are
	followed by a single space, so the extra lines are emitted as
	continuation lines rather than as new (malformed) key lines.

	NOTE: an empty line inside a value becomes a whitespace-only line in the
	output; no dot marker is written for it.

	Args:
		records: the records to write.
		output_stream: text stream to write to.
	"""
	for record in records:
		for key, value in record.items():
			# Mark every line after the first as a continuation line.
			text = f'{value}'.replace('\n', '\n ')
			output_stream.write(f'{key}: {text}\n')
		output_stream.write('\n')

def main():
	"""Command-line entry point: convert stdin to stdout.

	By default, reads Rec format from stdin and writes delimited text with
	the columns given by --fields. With --reverse, reads delimited text from
	stdin and writes Rec format. On any error, prints ``Error: ...`` to
	stderr and exits with status 1.
	"""
	parser = argparse.ArgumentParser(description='Process Rec format and output TSV/CSV format or vice versa')
	parser.add_argument('-f', '--fields', type=str, help='comma-separated list of field names', required=True)
	parser.add_argument('--sep', type=str, default='\t', help='separator for input/output TSV/CSV format')
	parser.add_argument('--reverse', action='store_true', help='convert TSV/CSV to the original Rec format')
	parser.add_argument('-s', '--strip-dots', action='store_true', help='strip leading dots from continued lines, as in RFC 822')
	options = parser.parse_args()

	fields = options.fields.split(',')

	try:
		if options.reverse:
			# read_csv's parameter is named `separator`, not `sep`.
			records = read_csv(sys.stdin, separator=options.sep)
			write_records(records, sys.stdout)
		else:
			records = read_records(sys.stdin, options.strip_dots)
			# write_csv takes the field list before the output stream.
			write_csv(records, fields, sys.stdout, delimiter=options.sep)
	except Exception as e:
		# Top-level boundary: report the failure briefly instead of a traceback.
		sys.stderr.write(f'Error: {e}\n')
		sys.exit(1)

# Run the converter only when this file is executed as a script, not when imported.
if __name__ == '__main__':
	main()
