#!/usr/bin/perl -w

=head1 SYNOPSIS

expand.pl
	[--defsFile defs-file | --storedDefs]
	[--saveDefs]
	[--wordList | --immediate]
	[--utf8List]
	[--respellList]
	[--respellutf8List]
	[--defList]
	[--uDefList]
	[--text text-file]
	[--wordFilter]
	[--utf8Filter]
	[--defFilter]
	[--uDefFilter]
	[--uRhymeList]
	[--baseList]
	[--uBaseList]
	[--csv]

=head1 COPYRIGHT

Copyright © Raphael Finkel 2001-2009 raphael@cs.uky.edu  

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

=head1 DESCRIPTION

Morphological definitions are given in the defs-file or in a database saved by
a previous run of this program with the --saveDefs option.

These definitions are used to generate reports and to filter input (given by
--text).  If --text is missing, it is taken as stdin; a "--text -" parameter
also indicates stdin.

One may generate reports and filter input in the same run, but that is not
likely useful.

Reports:
	--wordList: show Romanization of all forms;
	--immediate: like --wordList, but output online (for interactive use)
	--utf8List: show UTF-8 encoding of all forms
	--uRhymeList: same as utf8List, but sorted by ending.
	--respellList: show all words that have non-phonetic spelling (in
		Romanization) in a format appropriate for the respell program
	--defList: show all definitions
	--uDefList: show all definitions using Unicode
	--baseList: like --wordList, but for each word also show base
	--ubaseList: like --baseList, but in Unicode
	--csv: build a comma-separated-value file: roman, yiddish, POS 

Filters:
	--wordFilter: show all words in the text file not generated by the defs-file.
	--utf8Filter: show all words in a utf8 text file not generated by the defs-file.
	--defFilter: echo the text, but add after each word a definition if known.
	--uDefFilter: same as defFilter, but for utf8 input.

=head1 METHOD

References to CY are to the Synopsis of Grammar at the end of College
Yiddish.

The following codes are used in the defs-file, prefixed by "/".

	D: diminutive
	N: plural in -n
	S: plural or possessive in -s
	A: adjective (with optional suffix)
	V: verb, with optional irregular infinitive
	I: noun, can take -ik or -dik or -edik
	H: Hebrew; needs separators around it
	P: preposition
	T: past tense in ge .. t
	G: separable prefix on verb, already conjugated to the past form (without
		ge)
	B: form of the past participle (including ge, if appropriate)
	C: name (needs dative, possessive)
	E: suffix
	J: -ns, as in bobens, beydns.
	K: positive, comparative, superlative adjective (can indicate new base)
	L: prefix
	-: nonexistent form, to be suppressed

Respellings are indicated by {internal codes} after a word or bar-delimited
word part.  The internal codes must not contain a space; use underscore (_)
instead.  Usually, hyphen is better.

=head1 BUGS

	/F seems to duplicate /G
	/I seems to duplicate /A
	akhuts{aHuc}/P doesn't obey the Hebrew spelling when adding -n

=cut

# initialized endings
	@DiminAfterVowel = ("le", "lekh");
	@DiminAfterCons = ("l", "ele", "lekh", "elekh");
	@Adjective = ("s", "er", "e", "n");
	@Verb = ("st", "stu", "t", "n");

# patterns
	$lengthenName = "(m|n|[bdfgkmnpstvz]l|khl|[ae]y|[iu])\\|?\$"; # CY A.3
	$lengthenInf = "(m|n[gk]?|(sh|kh|[bBdfgkmnpstvz])l|[oae]y|[iou])\\|?\$";
		# CY G.35.b

# initialized variables
	my $baseList = 'none'; # or 'text' or 'utext';
	my %csv = ();

use Encode 'decode_utf8';
use Data::Dumper;
use utf8;

keys %fullList = 50000; # hash containing all definitions
keys %baseList = 50000; # hash containing all base forms
my ($utf8) = 0;  # set to 1 in initialization if output is in utf8

sub emit { # a morphological form has been generated
	local ($theWord, $pos) = @_;
	$pos //= '?';
	# $pos is only used for csv style
	if ($baseList eq 'text') {
		$theWord =~ s/\|//g;
		$baseList{$theWord} //= {};
		${$baseList{$theWord}}{$wordBase} = 1;
	} elsif ($baseList eq 'utext') {
		my $internal = Internal_to_UTF(Yivo_2_Internal($theWord));
		my $uBase = Internal_to_UTF(Yivo_2_Internal($wordBase));
		$baseList{$internal} //= {};
		${$baseList{$internal}}{$uBase} = 1;
		# print STDERR "$uBase => $internal\n";
	} elsif ($baseList eq 'csv') {
		return if $pos eq '?';
		$uWord = Internal_to_UTF(Yivo_2_Internal($theWord));
		$theWord =~ s/\|//g;
		if (exists $csv{$theWord} and $csv{$theWord}{'base'} eq $wordBase) {
			$csv{$theWord}{'POS'} .= " $pos";
		} else {
			$csv{$theWord} = {};
			$csv{$theWord}{'romanized'} = $theWord;
			$csv{$theWord}{'utf'} = $uWord;
			$csv{$theWord}{'POS'} = $pos;
			$csv{$theWord}{'base'} = $wordBase eq '-' ? $theWord : $wordBase;
		}
	} # csv
	if ($theWord =~ / /) {
		print STDERR "BAD: [$theWord]\n"
	}
	if (defined($suppress{$theWord})) {
		# print "suppressing $theWord\n";
		return;
	}
	if ($theWord eq "\n") {
		print "\n" unless $#ARGV >= 1;
		return;
	}
	my $toPrint = $theWord;
	$toPrint =~ s/^\||\|$//g; # remove | fore and aft
	$toPrint = $spells{$theWord} if ($utf8 && defined($spells{$theWord}));
	if ($allArgs =~ /--immediate\b/) {
		print $toPrint;
		print " [$definition]" unless $definition eq "";
		print " ";
	}
	if (defined($fullList{$toPrint}) and $definition ne "") {
		$fullList{$toPrint} .= "; $definition"
			unless ($fullList{$toPrint} =~ /\b\Q$definition\E\b(ل?)/);
	} else {
		$fullList{$toPrint} = $definition;
	}
	if ($allArgs =~ /--(utf8Filter|uDefFilter)\b/i) {
		$internal = Internal_to_UTF(Yivo_2_Internal($toPrint));
		if (defined($knownUtf8{$internal})) {
			$knownUtf8{$internal} .= ", $definition"
				unless $knownUtf8{$internal} =~ /\b\Q$definition\E\b/;
		} else {
			$knownUtf8{$internal} = $definition;
			# print "I know [$internal]\n";
		}
	}
	return($theWord);
} # emit

my %adjSuffixPos = (
	'er' => 'ADJ.MASC.NOM.SG ADJ.FEM.DAT.SG',
	'e' => 'ADJ.FEM.NOM.SG ADJ.NEUT.NOM/ACC.DEF.SG ADJ.PL',
	's' => 'ADJ.NEUT.NOM.SG ADJ.NEUT.ACC.SG',
	'n' => 'ADJ.MASC.ACC.SG ADJ.MASC.DAT.SG ADJ.NEUT.DAT.SG',
); # adjSuffixPos

my %verbSuffixPos = (
	'' => 'V.1.SG',
	'st' => 'V.2.SG',
	'stu' => 'V.2.SG',
	't' => 'V.3.SG V.2.PL',
	'n' => 'V.INF V.1.PL V.3.PL'
); # verbSuffixPos

my %diminSuffixPos = (
	'le' => 'NOUN.DIM.SG',
	'l' => 'NOUN.DIM.SG',
	'ele' => 'NOUN.DIM.SG',
	'lekh' => 'NOUN.DIM.PL',
	'elekh' => 'NOUN.DIM.PL',
);

my %notePos = (
	'adj' => 'ADJ',
	'adj attributive' => 'ADJ.ATTR',
	'adj indeclinable' => 'ADJ',
	'adj predicative' => 'ADJ.SG.PRED',
	'adj predicative, undeclined' => 'ADJ.SG.PRED',
	'adv' => 'ADV',
	'article' => 'ART',
	'clause' => 'ADV-V.PAST',
	'conj' => 'CNJ',
	'conj correl' => 'CNJ.CORREL', # no standard way to say correlative
	'conj subordinating' => 'CNJ.SUBR', 
	'd' => 'NUM',
	'f/m' => 'NOUN.FEM/MASC',
	'f/n' => 'NOUN.FEM/NEUT',
	'f?' => 'NOUN.FEM?',
	'f' => 'NOUN.FEM',
	'interj' => 'EXCLAM',
	'm/f' => 'NOUN.MASC/FEM',
	'm/n?' => 'NOUN.MASC/NEUT?',
	'm/n' => 'NOUN.MASC/NEUT',
	'm?' => 'NOUN.MASC?',
	'm' => 'NOUN.MASC',
	'n/f' => 'NOUN.NEUT/FEM',
	'n/m' => 'NOUN.NEUT/MASC',
	'n?' => 'NOUN.NEUT?',
	'n' => 'NOUN.NEUT',
	'noun/adj' => 'NOUN',
	'noun dative' => 'NOUN.DAT',
	'noun' => 'NOUN',
	'?' => 'NOUN', # of unknown gender
	'participle' => 'PART',
	'particle' => 'PTCL',
	'prefix' => 'PREFIX', # non-standard
	'prep' => 'PREP',
	'pronoun (objective)' => 'PN.ACC',
	'pronoun' => 'PN',
	'pronoun (possessive)' => 'PN.POSS',
	'pronoun (subjective)' => 'PN.NOM',
	'verb' => 'V',
);

sub doAdjective {
	local ($theBase, $special) = @_;
	$special //= ''; # might be .COMP or .SUPL or .PART
	local($base);
	$theBase = $spells{$theBase} if ($utf8 && defined($spells{$theBase}));
	foreach $suffix (@Adjective) {
		my $pos = $adjSuffixPos{$suffix};
		# print STDERR "special: $special\n";
		if ($special eq '.PART') {
			$pos =~ s/\bADJ\b/PART/g;
			# print STDERR "the pos is now $pos\n";
		} else {
			$pos .= $special;
		}
		if ($theBase =~ /(ar|or|oyr|[^aeiouyr|])n$/ && $suffix =~ /^(e|n)/) {
			# eg: gevezn + er => gevezener; gevezn + n => gevezenem
			# CY C.13: supporting (epenthetic) "e".
			# I added case r for modern + er => moderner, not moderener
			# likewise hiltserner, glezerner.
			#   but derfarener, farloyrener, geborener
			$theBase =~ /(.*)n$/;
			$base = $1 . "en";
		} else {
			$base = $theBase;
		}
		if ($suffix eq "n") {
			if ($base =~ /(.*)e$/) { # this rule seems wrong. 1/2013
				# eg: rozeve + n => rozevn
				doAddN($1, "(oy|i|m|m\\w)\$", "(ne|ay|n)\$", $pos);
			} else {
				# CY C.11
				doAddN($base, "([oae]y|[ioum])\$", "(^nay|ne|n)\$", $pos);
			}
			# eg: bloyen, frien; yenem, nayem
		} elsif ($suffix eq "s" && $base =~ /(s|st)$/) {
			# CY C.12 (covers "s", not "st")
			# eg: groys + s, alt|st + s
		} elsif ($suffix eq "s" && $base =~ /t$/) {
			# eg: geshvitst|s, gezukhts
			emit($base . "|" . $suffix, $pos);
		} elsif ($suffix =~ "^e" && $base =~ /e$/) {
			# eg: azoyne + er
			# ignore an extra -e ending
			$shortbase = $base; # make a copy
			$shortbase =~ s/e$//;
			emit($shortbase . $suffix, $pos);
		} elsif ($suffix eq "s" && $base =~ /(sh|i)$/) {
			emit($base . "es", $pos);
			# eg: englishes, fries
		} else {
			emit($base . $suffix, $pos);
		}
	}
} # doAdjective

sub doAddN {
	local ($base, $forEn, $forEm, $pos) = @_;
	$pos //= 'UNK3';
	$base = $spells{$base} if ($utf8 && defined($spells{$base}));
	$shortbase = $base; # make a copy
	$shortbase =~ s/e$//; # remove any final e, but not from "ze"
	# eg: yene + em => yenem
	if ($forEm ne "" && $base =~ /$forEm/) {
		return(emit($shortbase . "em", $pos));
	} elsif ($base =~ /$forEn/) {
		return(emit($shortbase . "en", $pos));
	} else {
		return(emit($base . "n", $pos));
	}
} # do AddN

sub joinUp { # join two strings, separating with | if necessary
	my ($first, $second, $shtum) = @_;
	$first = $spells{$first} if ($utf8 && defined($spells{$first}));
	$second = $spells{$second} if ($utf8 && defined($spells{$second}));
	# print "joining $first to $second\n";
	if  ("$first\01$second" =~ /s\01h|k\01h|t\01s|z\01h|e\01y/) {
		return($first . '|' . $second);
	} elsif ($shtum and $second =~ /^(oy|ey|ay|i)/) {
		return($first . '#' . $second); # shtumer alef
	} else {
		return($first . $second);
	}
} # joinUp

sub ToUTF {
	local($left, $right) = @_;
	$mychar = ord($left) * 256 + ord($right);
	if ($mychar <= 0x7F) { # 7 sig bits; plain 7-bit ascii
		return chr $mychar;
	} elsif ($mychar <= 0x7FF) { # 11 sig bits; Hebrew is in this range */
		$first = chr(0300 | ($mychar >> 6)&037);
		$second = chr(0200 | $mychar & 077);
		return "$first$second";
	} elsif ($mychar <= 0xFFFF) { # 16 sig bits */
		$first = chr(0340 | (($mychar >> 12)&017));
		$second = chr(0200 | (($mychar >> 6)&077));
		$third = chr(0200 | ($mychar & 077));
		return "$first$second$third";
	}
} # ToUTF

sub Internal_to_UTF {
	my ($text) = @_;
	use utf8;
	$text =~ tr{#bgdhwzHtyXxlMmNnseFPCckrSTOv-ʼ}
	           {אבגדהוזחטיךכלםמןנסעףפץצקרשתױװ־'};
	$text =~ s/a/אַ/g;
	$text =~ s/B/בֿ/g;
	$text =~ s/K/כּ/g;
	$text =~ s/f/פֿ/g;
	$text =~ s/p/פּ/g;
	$text =~ s/Q/שׂ/g;
	$text =~ s/W/תּ/g;
	$text =~ s/I/ײַ/g;
	$text =~ s/A/ײ/g;
	$text =~ s/i/יִ/g;
	$text =~ s/o/אָ/g;
	$text =~ s/u/וּ/g;
	$text =~ s/ʼ/'/g; # not caught by tr above
	$text =~ s/-/־/g; # makef
	return $text;
	no utf8;
} # Internal_to_UTF

sub Yivo_2_Internal {
	my ($text, $noRespell) = @_;
	@respelled = ();
	if (defined($noRespell)) { # avoid looking up spellings
		$text =~ s/_//g;
		$text = lc($text);
	}
	foreach $part (split /\|/, $text) {
		# print "part $part, spelled $spells{$part}\n";
		push @respelled, defined($spells{$part}) && !defined($noRespell)
			? $spells{$part} : $part;
	}
	$text = join("|", @respelled);
	# $text =~ s/\bfar(ey|in|ib|um)/far#$1/g; # "fareynik"
	# $text =~ s/\bfarur/far#ur/g; # ur has to have a shtumer alef here
	# $text =~ s/\bur(ey)/ur#$1/g; # "ureynikl"
		# maybe need a rule for geiblt, which also needs a shtumer alef?
    $text =~ s/ay/I/g; # pasakh tsvey-yud
    $text =~ s/ey/A/g; # tsvey-yud
	$text =~ s/oy/O/g; # vov-yud
	$text =~ s/u/w/g; # vov
	$text =~ s/iy/Jy/g; # khirik-yud yud
	$text =~ s/vw/vu/g; # tsvey-vov melupm-vov
	$text =~ s/wv/uv/g; # melupm-vov tsvey-vov
	$text =~ s/ww/uw/g; # melupm-vov vov
	$text =~ s/wy/Uy/g; # melupn-vov yud
	$text =~ s/wi/wJ/g; # vov khirik-yud
	$text =~ s/(^|[^\w#~|]|_)([OAIiuw])/$1#$2/g; # shtumer alef
	$text =~ s/ie/Je/g; # khirik-yud ayin
	$text =~ s/ii/JJ/g; # khirik-yud khirik-yud
	$text =~ s/iw/Jw/g; # khirik-yud vov
	$text =~ s/Oi/OJ/g; # vov-yud khirik-yud
	$text =~ s/([aAeIoOu])i/$1J/g; # vowel khirik-yud
	$text =~ s/i([aAeIoOu])/J$1/g; # khirik-yud vowel
	$text =~ s/i/y/g; # yod
	$text =~ s/J/i/g; # khirik-yud
	$text =~ s/yy/|yi/g; # yud khirik-yud
	$text =~ s/kh/x/g; # khof
	$text =~ s/tsh/tS/g; # tes shin
	$text =~ s/ts/c/g; # tsadik
	$text =~ s/(\w|[#'"\x80-\x91]|\|)([nxfmc])($|[^\w^#~|\x80-\x91]|_)/$1\u$2$3/g; # final letters
		# must be at least a 2-letter word 
	$text =~ s/(\w[kh])It(N?)\b/$1At$2/g; # xxxkayt => xxxkeyt
	$text =~ s/sh/S/g; # shin
	$text =~ s/zh/zS/g; # zayin shin
	$text =~ s/dj/dzS/g; # daled zayin shin
	$text =~ s/\|//g; # remove prophylactic  |
	$text =~ s/J/i/g; # explicit pintl-yud
	$text =~ s/U/u/g; # explicit melupm-vov
	$text =~ s/V/w/g; # explicit single vov
	return($text);
} # Yivo_2_Internal

sub getData {
	if ($allArgs =~ /--storedDefs\b/) {
		# print "Using stored defs.\n";
		# tie %i2y, 'DB_File', 'i2y.data', O_RDWR|O_CREAT, 0664, $DB_HASH;
		tie %spells, 'DB_File', 'spells.data', O_RDWR|O_CREAT, 0664, $DB_HASH;
		tie %fullList, 'DB_File', 'defs.data', O_RDWR|O_CREAT, 0664, $DB_HASH;
		return;
	}
	my ($fileName);
	$allArgs =~ /--defsFile ([-\w.\/]+)/ or usage();
	$fileName = $1;
	if ($fileName eq "-") {
		$DATAFILE = *STDIN;
	} else {
		open INPUT, "<:utf8", $fileName || die "Can't open $fileName; stopping";
		$DATAFILE = *INPUT;
	}
	local $wordBase;
	while (defined($line = <$DATAFILE>)) { # one line of dictionary input
		$definition = "" unless $line =~ /^\s/;
		# $line =~ s/\[[^]]*\]//g; # remove definitions, spelling annotations
		$line =~ s/%.*//; # remove comments
		next if ($line =~ /^\s*$/); # blank line; ignore
		# print "line: $line";
		@parts = ();
		while ($line =~ s/^\s*([-\w#\|\/{}'ʼ"״″~ ]+|\[[^]]*\])//) {
			# the space in the pattern is non-breaking 00a0
			# one unit or signif. cmnt
			# print STDERR "pushing $1\n";
			push @parts, $1;
		}
		# print "\tparts: ", join(', ', @parts), "\n";
		@newparts = ();
		$localdef = undef;
		my $pos = '?'; # unless determined otherwise
		foreach $unit (reverse @parts) { # backwards to get defs first
			$unit =~ s/'//g; # remove accent; we ignore it
			if ($unit =~ /\[(.*)\]/) { # significant comment
				my $comment = $1;
				# print "significant: $unit\n";
				if ($comment =~ /def: ([^]]*)/) {
					push @newparts, $localdef if defined($localdef);
					$localdef = $unit;
				} elsif (exists $notePos{$comment}) {
					$pos = $notePos{$comment};
				} elsif ($comment =~ /^\w+:/) {
					# ignore
				} else {
					print STDERR "Unknown unit: [$comment]\n";
				}
			} else {
				push @newparts, $unit;
			}
		} # backwards sweep through @parts
		push @newparts, $localdef if defined($localdef);
		@parts = reverse @newparts;
		# print "after shifting: ", join(", ", @parts), "\n";
		foreach $unit (@parts) { # one unit
			# print STDERR "\tunit: $unit\n";
			if ($unit =~ /\[([^\]]*)\]/) { # significant comment
				my $comment = $1;
				if ($comment =~ /def: (.*)/) {
					$definition = $1;
					# print STDERR "\tnew def: $localdef\n";
				} else {
					print STDERR "comment: $comment\n";
				}
			}
			$unit =~ /([-\w|{}#ʼ״″~]*)(.*)/; 
			$rest = $2;
			if ($1 ne "") { # new base
				$base = removeBraces($1);
				$participle = "";
				$wordBase = $base;
				# emit($base, $pos) unless $base eq '-';
				emit($base, $pos); # if $pos ne '?';
				# print STDERR "New base: $base\n";
			} # new base
			$base //= ""; # for very start of file with /
			while ( $rest =~ s/^\/([-\w|{}#]+)//) { # one code
				$code = $1;
				# print "code $code\n";
				decode: {
					if ($code =~ "^E") { # ending
						$pos = 'NOUN';
						$code =~ /E(.+)/;
						$suffix = $1;
						emit(joinUp($base, $suffix, 0), "$pos.PL");
						last decode;
					} elsif ($code =~ /^X(.*)/) { # plural form
						$pos = 'NOUN';
						$pluralForm = removeBraces($1);
						if ($pluralForm ne "") {
							# print STDERR "Plural: $base -> $pluralForm\n";
							emit($pluralForm,, "$pos.PL");
						} else {
							# print STDERR "No plural form for $base\n";
						}
						last decode;
					} elsif ($code =~ "^B") { # participle form
						$code =~ /B(.+)/;
						$participle = removeBraces($1);
						emit($participle, 'PART');
						doAdjective($participle, '.PART');
						last decode;
					} elsif ($code eq "H") { # Hebrew
						$base = '|' . $base . '|';
						last decode;
					} elsif ($code =~ /^D/) { # diminutives
						if ($code =~ /^D(.+)/) {
							$base = removeBraces($1);
						}
						if ($base =~ /[aeiou](\|)?$/) {
							foreach $suffix (@DiminAfterVowel) {
								my $pos = $diminSuffixPos{$suffix};
								emit(joinUp($base, $suffix, 0), $pos);
							}
						} else {
							foreach $suffix (@DiminAfterCons) {
								my $pos = $diminSuffixPos{$suffix};
								emit(joinUp($base, $suffix, 0), $pos);
							}
						}
						last decode;
					} elsif ($code eq "C") { # name
						if ($base =~ /^(tat|zeyd)e$/) { # special dative
							# CY A.4
							$root = $1;
							doAddN($root, "", "", 'N.DAT'); # dative
						} else { # ordinary dative
							if (defined($spells{$base})) {
								$base .= '|';
							}
							doAddN($base, $lengthenName, "", 'N.DAT');
						}
						if ($base =~ /(s|ts|sh|z)$/) {
							# CY A.5
							emit(joinUp($base, 'es', 0), 'N');
						} else { # unless -s, last syllable accent: use '
							emit(joinUp($base, 's', 0), 'N');
						}
						last decode;
					} elsif ($code eq "J") { # -ns
						# possessive form
						my $pos = 'NOUN.POSS';
						if ($base =~ "(.*)e\$") {
							# eg: tatn
							$root = $1;
							if ($root =~ /(ndl|n|ng|oy|ay|m|m\w|nk)$/) {
								# eg: mamens
								emit(joinUp($root, "ens", 0), $pos); 
							} else {
								# eg: tatns
								emit(joinUp($root, "ns", 0), $pos); 
							}
						} else {
							if ($base =~ /(ndl|n|ng|oy|ay|m|m\w|nk)\$/) {
								emit(joinUp($base, "ens", 0), $pos); 
							} else {
								# eg: yidns
								emit(joinUp($base, "ns", 0), $pos); 
							}
							# eg: Sheyndl + n => Sheyndlen
						}
						last decode;
					} elsif ($code =~ "^A(.*)") { # adjective
						emit($base,
							 'ADJ.PRED.SG ADJ.NEUT.NOM/ACC.INDF.SG');
						if ($1 ne "") {
							doAdjective($1);
						} else {
							doAdjective($base);
						}
						last decode;
					} elsif ($code =~ /^K(.*)/) { # positive, comparative, superlative
						emit($base, 'ADJ.SG.PRED');
						if ($1 eq "") { # standard, eg: flink, flinker, flinkster
							$comparative = $base;
						} else { # exceptional, eg: gut, beser, bester
							$comparative = $1;
							emit(joinUp($comparative, "er", 0), 'ADJ.COMP');
						}		
						doAdjective($base);
						# comparative: CY C.14.a
						doAdjective($comparative . "er", '.COMP');
						# superlative: CY C.14.b; requires case, gender ending
						if ($comparative =~ /s$/) {
							# eg: zis + ster => zister; CY C.14.b
							doAdjective($comparative . "t", '.SUPL');
						} else {
							doAdjective(joinUp($comparative, "st", 0), '.SUPL');
						}
						last decode;
					} elsif ($code =~ /I(.*)/) { # word taking -ik; explicit option
						if ($1 ne "") {
							$ending = $1;
						} elsif ($base =~ /(l|kht|d|ns|er|ts|tst)$/) {
							# eg: amoliker
							$ending = "ik";
						} elsif ($base =~ /(sh|es|em|l|kh|n)$/) {
							# eg: khoydesh, shabes, sholem, shul, peysekh,
							$ending = "dik";
						} else {
							$ending = "edik";
						}
						$most = emit(joinUp($base, $ending, 0), 'ADJ');
						doAdjective($most);
						last decode;
					} elsif ($code =~ /^V(.*)/) { # verb, optional infinitive
						$infinitive = removeBraces($1);
						if ($1 ne "") { # explicit infinitive
							emit($infinitive, 'V.INF');
						}
						# print "setting infinitive to $infinitive\n";
						foreach $suffix (@Verb) {
							my $pos = $verbSuffixPos{$suffix};
							# CY G.35
							if ($suffix eq "n") {
								local($newBase) = $base;
								if ($base =~ /(.*[^aeiouy])n$/) {
									# eg: efn + n => efenen
									# epenthetic e
									$newBase = $1 . "en";
								}
								if ($infinitive eq "") {
									$infinitive =
										doAddN($newBase, $lengthenInf, "", $pos);
									# print "setting infinitive to $infinitive\n";
								} else {
									doAddN($newBase, $lengthenInf, "", $pos);
								}
								# College Yiddish p. 55
							} elsif ($base =~ /(?<!t)s$/ && $suffix =~ /^s/) {
								# CY G.35.e: fold s with s
								$new = $suffix; # must copy; $suffix is in @Verb
								$new =~ s/^s//;
								emit(joinUp($base, $new, 0), $pos);
							} elsif ($base =~ /t[|]?$/ && $suffix =~ /^t/) {
								# CY G.35.d: fold t with t
								$new = $suffix; # must copy; $suffix is in @Verb
								$new =~ s/^t//;
								emit(joinUp($base, $new, 0), $pos);
							} else {
								emit(joinUp($base, $suffix, 0), $pos);
							}
						}
						$most = joinUp($infinitive, "dik", "", 0);
						emit($most, 'V.PRES.PTCP');
						doAdjective($most);
						last decode;
					} elsif ($code eq "T") { # verb in ge .. t
						local($newBase);
						if ($base =~ /^ge(.*)/) {
							# eg: gefel
							$newBase = $1;
						} else {
							$newBase = $base;
						}
						if ($newBase =~ /t[|]?$/) {
							$participle = joinUp('ge', $newBase, 1);
							emit($participle, 'PART');
							doAdjective($participle, '.PART');
						} else {
							$participle = joinUp('ge',
								 joinUp($newBase, 't', 0), 1);
							emit($participle, 'PART');
							doAdjective($participle);
						}
						last decode;
					} elsif ($code =~ /^L(.+)/ ) { # prefix
						$prefix = $1;
						emit(joinUp($prefix, $base, 1), 'ADV');
						last decode;
					} elsif ($code =~ "^G") { # separable prefix on past participle
						$code =~ /G(.*)/;
						$prefix = $1;
						# print "infinitive $infinitive \n";
						emit(joinUp(joinUp($prefix, "tsu", 0), $infinitive, 1),
							'V.INF');
						$most = emit(joinUp($prefix, $infinitive, 1), 'V.INF');
						emit($most . "dik", 'PRES.PTCP');
						emit($prefix . $participle, 'UNK1');
						doAdjective($most . "dik");
						doAdjective($prefix . $participle);
						last decode;
					} elsif ($code eq "N") { # plural in -n
						$pos = 'NOUN';
						doAddN($base, $lengthenInf, "", "$pos.PL");
						# eg: eplen, zunen, tsaytungen, froyen, bekerayen
						last decode;
					} elsif ($code eq "P") { # preposition
						my $pos = 'PREP';
						# CY B.8
						if ($base =~ /n$/) {
							if ($base =~ /(.*[^aeiouy])([nl])$/) {
								# eg: tsvishn + m => tsvishenem
								emit($1 . "e" . $2 . "em", $pos);
							} else {
								emit($base . "em", $pos);
							}
						} elsif ($base =~ /(u|y)$/) {
							# eg: tsum, baym
							emit($base . "m", $pos);
						} else {
							emit($base . "n", $pos);
						}
						last decode;
					} elsif ($code eq "S") { # plural in -s
						if ($base =~ /(s|sh)$/) {
							emit($base . "es", "$pos.PL");
						} else {
							emit(joinUp($base, 's', 0), "$pos.PL");
						}
						last decode;
					} elsif ($code =~ /^-(.*)/) { # form to suppress
						$base = $1;
						$suppress{$base} = 1;
						# print "Will suppress $base\n";
					} else {
						print STDERR "Cannot decode $code\n";
					}
				} # decode loop
			} # deal with one code
		} # one unit
		print "\n" if ($allArgs =~ /--immediate\b/);
	} # foreach line of dictionary input
	# $stat = "" . %fullList;
	# print "done; hash has $stat\n";
	if ($allArgs =~ /--saveDefs\b/) {
		print "about to save defs\n";
		unlink 'spells.data', 'defs.data';
		# tie %i2y, 'DB_File', 'i2y.data', O_RDWR|O_CREAT, 0664, $DB_HASH;
		tie %saveSpells, 'DB_File', 'spells.data', O_RDWR|O_CREAT, 0664, $DB_HASH;
		tie %saveFullList, 'DB_File', 'defs.data', O_RDWR|O_CREAT, 0664, $DB_HASH;
		%saveSpells = %spells;
		%saveFullList = %fullList;
		print "finished saving defs\n";
	}
} # getData

sub removeBraces {
	my ($unit) = @_;
	my ($romanized, $respelling, $header);
	@results = ();
	$unit =~ s/\~/ /g; 
	# print "unit is $unit\n";
	for $piece (split /}/, $unit) { # one piece
		if ($piece =~ /(.*)\{([^}]*)/) { # spelling rule
			# print STDERR "working on $piece}:";
			$romanized = $1;
			$respelling = $2;
			$header = "";
			if ($romanized =~ /(.*)\|([^|]*)/) {
				$header = $1;
				$romanized = $2;
			}
			# print STDERR "after romanization: ($header , $romanized)\n";
			$romanized =~ s/^-//g;
			if ($romanized ne '' and defined($spells{$romanized}) and
					$spells{$romanized} ne $respelling) {
				print STDERR "$romanized is spelled both as ",
					$spells{$romanized}, " and as $respelling\n";
			}
			$spells{$romanized} = $respelling;
			# print STDERR "I see that you spell $romanized $respelling\n";
			push @results, $header . $romanized;
			# print STDERR " got ", $header . $respelling, "\n";
		} else {
			push @results, $piece;
		}
	} # one piece
	return (join("", @results));
} # removeBraces

sub doWork {
	return if ($allArgs =~ /--saveDefs/);
	if ($allArgs =~ /--text (-|\S+)/) {
		$fileName = $1;
		if ($fileName eq "-") {
			$TEXTFILE = *STDIN;
		} else {
			open INPUT, "<:utf8", $fileName || die "Can't open $fileName; stopping";
			$TEXTFILE = *INPUT;
		}
	} else {
		$TEXTFILE = *STDIN;
	};
	if ($allArgs =~ /--wordList\b/) {
		# print "All words\n";
		while (($form,undef) = each %fullList) {
			print "$form\n" unless defined($suppress{$form});
			$suppress{$form} = 0; # we don't want duplicates
		}
	};
	if ($allArgs =~ /--(uB|b)aseList\b/) {
		my $expansion;
		for my $word (sort keys %baseList) {
			print "$word: " . join(' ', keys %{$baseList{$word}}) . "\n";
		}
	};
	if ($allArgs =~ /--defList\b/) {
		while (($form,$def) = each %fullList) {
			print "$form\t$def\n"
				unless defined($suppress{$form}) or $def eq "";
		}
	}
	if ($allArgs =~ /--uDefList\b/) {
		while (($form,$def) = each %fullList) {
			next if $suppress{$form};
			next if $def eq "";
			$internal = Yivo_2_Internal($form);
			print Internal_to_UTF($internal), "\t", $def, "\n";
		}
	}
	if ($allArgs =~ /--wordFilter\b/) {
		while (defined($line = <$TEXTFILE>)) { # one line of text input
			if ($line =~ /\\/) {next;} # don't spellcheck TeX lines
			while ($line =~ s/([\w|#]+)//) { # one word of input
				$word = $1;
				if (!defined $fullList{$word} or defined($suppress{$word})) {
					print $word , "\n"; # wasn't found
				}
			} # one word of input
		} # one line of text input
	}
	if ($allArgs =~ /--utf8Filter\b/i) {
=head1
		while (($form,undef) = each %fullList) {
			next if $suppress{$form};
			$knownUtf8{Internal_to_UTF(Yivo_2_Internal($form))} = $definition;
			print "I know [", Internal_to_UTF(Yivo_2_Internal($form)), "]\n";
		}
=cut
		# print "known: ", join(", ", keys %knownUtf8), "\n";
		while (defined($line = <$TEXTFILE>)) { # one line of text input
			$line =~ s/[][.,?:;*'"!()„“־׳-]/ /g;
			# print "line is $line\n";
			foreach $word (split /\s+/, $line) {
				# one word of input
				next if ($word =~ /^\d+$/);
				next if ($word =~ /^$/);
				if (!defined($knownUtf8{$word})) {
					print "$word", "\n";
					$knownUtf8{$word} = 2;
					# print "known: ", join(", ", keys %knownUtf8), "\n";
				} else {
					# print "I knew $word\n";
				}
			} # one word of input
			no utf8;
		} # one line of text input
	}
	if ($allArgs =~ /--defFilter\b/) {
		while (defined($line = <$TEXTFILE>)) { # one line
			$line =~ s/([\w|#]+)/defined($fullList{$1}) ? "$1 ($fullList{$1})" : "$1"/eg;
			print $line;
		}
	}
	if ($allArgs =~ /--uDefFilter\b/) {
		# print "doing uDefFilter\n";
		$alphabet = "אבגדהוזחטיךכלםמןנסעףפץצקרשתױװאַבֿכּפֿפּשׂתּײַײיִאָוּ";
		while (defined($line = <$TEXTFILE>)) { # one line
			$line =~ s/([$alphabet]+)/defined($knownUtf8{$1}) ? "$1 \/$knownUtf8{$1}\/" : "$1"/eg;
			print $line;
		}
	}
	if ($allArgs =~ /--respellList\b/) {
		print "Alphabetical.  ",
			"Use internal form for RHS (without final forms).\n";
		print "All respelled Hebrew words\n";
		foreach $spell (sort (keys %spells)) {
            # my $toPrint = decode_utf8($spells{$spell});
            my $toPrint = $spells{$spell};
			printf "$spell %s\n", $toPrint
                 if defined($toPrint); # internal apostrophe prevents
		}
	}
	if ($allArgs =~ /--respellutf8List\b/) {
		foreach $spell (keys %spells) {
            my $respell = $spells{$spell};
			next unless defined $respell;
			$respell =~ s/x$/X/;
			$respell =~ s/m$/M/;
			$respell =~ s/n$/N/;
			$respell =~ s/c$/C/;
			$respell =~ s/f$/F/;
			printf "%s %s\n", Internal_to_UTF(Yivo_2_Internal($spell, 1)),
				Internal_to_UTF($respell);
		}
	}
	if ($allArgs =~ /--utf8List\b/) {
		while (($form,undef) = each %fullList) {
			# print Yivo_2_Internal($form), ": ";
			next if $suppress{$form};
			$internal = Yivo_2_Internal($form);
			# $internal = $form; # debug
			next if defined($emitted{$internal}); # suppress duplicates
			$emitted{$internal} = 0;
			# $i2y{$internal} = $form;
			# print Internal_to_UTF($internal), " $form\n";
			print Internal_to_UTF($internal), "\n";
		}
	}
	if ($allArgs =~ /--uRhymeList\b/) {
		@words = ();
		while (($form,undef) = each %fullList) {
			# print Yivo_2_Internal($form), ": ";
			next if $suppress{$form};
			$suppress{$form} = 1; # no need for duplicates
			$form =~ s/_//g;	
			push @words, scalar reverse($form);
			# $internal = Yivo_2_Internal($form);
			# $i2y{$internal} = $form;
			# push @words, myReverse(Internal_to_UTF($internal));
		}
		foreach $word (sort @words) {
			# print myReverse($word), "\n";
			print scalar reverse($word), "\t",
				Internal_to_UTF(Yivo_2_Internal(scalar reverse($word))), "\n";
		}
	} # uRhymeList
	if ($allArgs =~ /--csv\b/) {
		print "Base,Romanized,Yiddish,Gloss,Definition\n";
		for my $entry (sort {
					${$a}{'base'} cmp ${$b}{'base'}
				} values %csv) {
			# print STDERR Dumper($entry);
			my $base = ${$entry}{'base'};
			next if $base eq '-'; # not a real base
			my $romanized = ${$entry}{'romanized'};
			my $def = $fullList{$romanized} // 'no definition';
			$def =~ s/"//g; # avoid " inside csv
			printf "%s,%s,%s,%s,\"%s\"\n",
				$base,
				$romanized,
				${$entry}{'utf'},
				${$entry}{'POS'},
				$def;
		} # each entry
	} # csv
} # doWork

sub usage {
	print STDERR
		"Usage: $0\n",
		"\t[--defsFile defs-file | --storedDefs]\n" ,
		"\t[--saveDefs]\n",
		"\t[--wordList | --immediate]\n",
		"\t[--utf8List]\n",
		"\t[--uRhymeList]\n",
		"\t[--respellList]\n",
		"\t[--respellutf8List]\n",
		"\t[--defList]\n",
		"\t[--uDefList]\n",
		"\t[--text text-file]\n" ,
		"\t[--wordFilter]\n",
		"\t[--utf8Filter]\n",
		"\t[--defFilter]\n",
		"\t[--uDefFilter]\n",
		"\t[--baseList]\n",
		"\t[--uBaseList]\n",
		"\t[--csv]\n",
		;
	exit(1);
} # usage

sub myReverse {
	# reverse utf-8 with composing characters together with base characters
	my ($word) = @_;
	# print "[$word]\n";
	if ($word =~ /^((\p{Ll}|\p{Lo}|\p{Lu}|\p{Lt})\p{Mn}*)(.*)((\p{Ll}|\p{Lo}|\p{Lu}|\p{Lt})\p{Mn}*)$/) {
		my ($first, $middle, $last) = ($1, $3, $4);
		$word = $last . myReverse($middle) . $first;
	} else {
		# print "not found\n";
	}
	# print "<$word>\n";
	return($word);
} # myReverse

# main program here
usage() if ($#ARGV == -1);
$allArgs = join(' ', @ARGV);
$utf8 = ($allArgs =~ /utf8/);
$baseList = 'text' if $allArgs =~ /--baseList\b/;
$baseList = 'utext' if $allArgs =~ /--uBaseList\b/;
$baseList = 'csv' if $allArgs =~ /--csv\b/;
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
getData();
doWork();
# untie %i2y;
untie %spells;
untie %fullList;
