package Ling::RuleEng;

=head1 NAME

Ling::RuleEng - Perl 5 module to provide a Chomsky-type rule engine

=head1 DESCRIPTION

This module implements a variation on the context-based rule invocation
described by Chomsky and Halle in ``The Sound Pattern of English''.
It uses the OOP (Object Oriented Programming) interface introduced with perl5.

=head1 INSTALLATION

This file should be copied to:

  <perl5lib>/Ling/RuleEng.pm

where  <perl5lib>  is your local perl5 library directory, usually
/usr/local/lib/perl5.  You'll likely have to create the Ling
directory, as the perl5 distribution doesn't include it yet.

You could then run the following to create a man page:

  cd /usr/local/lib/perl5/Ling
  pod2man RuleEng.pm > /usr/local/man/man1/RuleEng.1
  chmod a+r /usr/local/man/man1/RuleEng.1

=head1 USAGE

To use it in your programs, you can use either:

  use Ling::RuleEng;

or

  require Ling::RuleEng;

RuleEng exports nothing.

A new RuleEng object must be created with the I<new> method.  Once
this has been done rules are accessed through this object.

Here is a short example: 

  #!/usr/local/bin/perl -w
 
  use Ling::RuleEng;
 
  $c = new Ling::RuleEng 'en2';

  $word = "test";
 
  $transcription = ($c->trans($word));
 
  print "$transcription\n";
 
  __END__

The rules are stored in the file ``en2'' in this case, and loaded
as part of the object creation. 

=head2 RuleEng Commands

There are only two methods that are 
normally used by programs. 

=over 10

=item I<new>

Use this to create a new RuleEng instance. It takes two arguments,
a rule filename and a debug flag.  It calls I<read_rules>.  A valid
filename should always be specified. An empty argument can be used
to specify debug.

Example:

  $c = new Ling::RuleEng("welsh");
or
  $c = new Ling::RuleEng("english.australian",2);

Returns a blessed reference, representing a new RuleEng instance.

=item I<trans>

Provides transcriptions of strings provided as arguments. May compress
the data depending on the status of the STYLE flag. Calls I<find_match>
(not described here) to do the processing.

=back

=head1 RULES FILES

A rules file specifies the manipulations to be carried out on the input
data. There are a series of directives, some optional, followed by the
keyword RULES, followed by a set of rules. The character # deontes a
comment as in Perl (always??). The directives allowed are DIRECTION
(not implemented), STYLE, either compact or expanded and CLASS, for
specifying classes of characters. In more detail...

The rule format is: 

    left context (optional) [[ focus ]] right context (optional) -> output

The focus is allowed to be null, but must not contain any regular
expressions.  The contexts are allowed to be regular expressions,
identically equal to those permitted in Perl.



Example:

    DIRECTION LeftToRight
    STYLE Compact
    RULE
    #
    # Semiconsonants
       [[ i ]] [']?[aeou] -> jj
       [[ u ]] [']?[aeio] -> w
    # Vowels
       [[ a ]]            -> a
       [[ e ]]            -> e
       [[ i ]]            -> i
       [[ o ]]            -> o
       [[ u ]]            -> u

=head1 BUGS

The DIRN directive is not implemented. Only right to left processing is 
permitted.

The debugging flag is not tied to any useful debugging information.

=head1 TODO

Improve this documentation.  Make debugging in the _add_defaults file
more explicit. Improve the debugging. Improve what happens on failure.


=head1 AUTHOR

Alistair Conkie  <adc@cstr.ed.ac.uk>

=head1 SOURCE

The latest version may be retrieved by sending mail to:

  <adc@cstr.ed.ac.uk>

=head1 COPYRIGHT

Copyright (c) 1995 Alistair Conkie. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut

require 5.001;

use Carp;

$version = '$Revision: 1.7 $';

# DATA

@acceptable_symbol_table_chars_are =  split(//,
	'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ');


%symbol_table = (
	']',']',
	'[','[',
	'(','(',
	')',')',
	'*','*',
	'+','+',
	'?','?',
	'|','|',
	'\\','\\',
	'{','{',
	'}','}',
	'^','^',
	'$','$',
	# more ? hopefully not now.
);

sub new { 
	my $name = shift;
	my $filename = shift;
	my $debug = shift;

	my $me = bless {
		### Something for the symbol table setting
		DEBUG => defined ($debug) ? $debug : 1,
		STYLE => '',
		DIRN => '',
		SYMTAB => { %symbol_table },
		REVSYMTAB => {},
		# I THINK class is not needed here??
		RULES => {},
		NRULES => 0,
	} , $name;


	# initialisation
	$me->_read_rules($filename) ;


	$me;
}

sub trans {
	my $me = shift;
	my $input = shift;
	my $j = '';
	my $k = '';

	$j = $me->_compactify_input($input);

	($k,$output) = $me->_find_match($j);

	# and maybe do some Carping
	
	$output;
}



# SUBROUTINES

sub _read_rules {
	my $me = shift;
	my $filename = shift;
	my $n = 0;
	
	unless(open(RULES,"$filename")) {
		print STDERR "Can't open filename: $!\n";
		# croak instead??
		return(0);
	}

	$reading_rules = 0;

	# rule counter
	while($in = <RULES>) {
		chop;
		$in =~ s/([^\\]|)\#.*/$1/;	# Eliminates everything after unescaped #
		if($in =~ /^\s*$/) {		# Eliminates blank lines
			next;
		}
		if($in =~ /^RULE[S]?$/)  {
			if($reading_rules == 0) {
				$reading_rules = 1;
				%class = ();	# So sep. objects have sep. class space
				next;
			} else {
				print STDERR "RULES appears too often\n";
				return(0);
			}
		} elsif($in =~ /^CLASS/) {
			if($reading_rules == 0) {
				@class = split(' ',$in);
				shift(@class);
				$classkey = shift(@class);
				$class{$classkey} = join(' ',@class);
				next;
			} else {
				print STDERR "CLASS is only allowed before RULES\n";
				return(0);
			}
		} elsif($in =~ /^STYLE/) {
			if($reading_rules == 0) {
				@style = split(' ',$in);
				$style = _conv_style($style[1]);
				if($style ne '') {
					$me->{STYLE} = $style;
					next;
				} else {
					print STDERR "Style value not recognised.\n";
				}
			} else {
				print STDERR "STYLE is only allowed before RULES\n";
				return(0);
			}
		} elsif($in =~ /^DIRECTION/) {
			if($reading_rules == 0) {
				@direction = split(' ',$in);
				$direction = _conv_dirn($direction[1]);
				if($direction ne '') {
					$me->{DIRN} = $direction;
					next;
				} else {
					print STDERR "Direction value not recognised.\n";
				}
			} else {
				print STDERR "DIRECTION is only allowed before RULES\n";
				return(0);
			}
		} elsif($reading_rules == 1) {
			foreach $celem (keys(%class)) {
				$in =~ s/$celem(.*)\[\[/$class{$celem}$1\[\[/g;
				$in =~ s/\]\](.*)$celem(.*)\-\>/\]\]$1$class{$celem}$2\-\>/g;
			}
			$in2 = $me->_compactify_rules($in);
			@bits = _add_defaults($in2);
			$n++;
			# using $headletter cuts down the rules to be searched
			if($bits[0] ne '') {
				$headletter = $bits[0];
				$headletter =~ s/^(.).*/$1/;
			} else {
				$headletter = '';
			}
			push(@{$rulebase{$headletter}},[ @bits ]);
		}
	}
	$reading_rules = 0;
	close(RULES);
	
	if($me->{DEBUG} != 1) {
		print STDERR "Finished reading $n rules, starting processing of input\n";
	}

	$me->{REVSYMTAB} = { reverse(%{$me->{SYMTAB}}) };
	$me->{RULES} = %rulebase;
	$me->{NRULES} = $n;

	return(1);
}
	
sub _find_match {
	my $me = shift;
	my $word = shift;

	my $key;
	my $w;
	my $t;
	my $targ;
	my $targ2;
	my $lc;
	my $rc;
	my $out;
	my $preword = '';

	WHLOOP:	
	while($word ne '') {
		foreach $key (@{$rulebase{''}}) {	# for syll bds etc
			($targ,$lc,$rc,$out) = @{$key};
			if(($preword =~ /$lc$/) && ($word =~/^$rc/)) {
				$t = "$t $out";
				last;
			}
		}
		$hl = $word;
		$hl =~ s/^(.).*/$1/;
		foreach $key (@{$rulebase{$hl}}) {
			($targ,$lc,$rc,$out) = @{$key};
			if(($preword =~ /$lc$/) && ($word =~/^($targ)($rc.*)/)) {
				$preword .= $1;
				$word = $2;
				$targ2 = $me->_decompactify_output($targ);
				$w = "$w $targ2";
				$t = "$t $out";
				next WHLOOP;
			}
		}
		if($word =~/^(.)(.*)/) {
			$preword .= $1;
			$word = $2;
			$targ2 = $me->_decompactify_output($1);
			$w = "$w $targ2";
			$t = "$t <unmatched>";
			next WHLOOP;
		}
	}
	return(($w,$t));
}

	
sub _conv_dirn {
	my $look = "\L$_[0]\E";
	my $dirn = '';

	if(($look =~ /^lefttoright$/) || ($look =~ /^ltor$/)) {
		$dirn = "ltor";
	} elsif(($look =~ /^righttoleft$/) || ($look =~ /^rtol$/)) {
		$dirn = "rtol";
	} else {
		print STDERR "Cannot recognise direction\n";
		print STDERR "Default is right to left\n";
	}
	return($dirn);
}

sub _conv_style {
	my $look = "\L$_[0]\E";
	my $style = '';
	
	if($look =~ /^compact$/) {
		$style = "compact";
	} elsif($look =~ /^expanded$/) {
		$style = "expanded";
	} else {
		print STDERR "Cannot recognise style\n";
		print STDERR "Default is compact (conventional REs)\n";
	}
	return($style);
}

sub _add_defaults {
	my $rule = $_[0];
	my $lc;
	my $targ;
	my $rc;
	my $out;

	### this SHOULD now work for null targets too.
	### need some way of flagging duff input

	### There must be a better way

	if($rule =~ /^\s*((\S+)\s+)?\[\[\s+((\S+)\s+)?\]\]\s+((\S+)\s+)?\-\>(.*)/) {
		$lc = $2;
		$targ = $4;
		$rc = $6;
		$out = join(' ',split(' ',$7));

		return(($targ,$lc,$rc,$out));
	}
}

sub _compactify_rules {
	my $me = shift;
	my $in = shift;
	my $el;
	my $out;
	my $now_all;
	
	if($style eq "expanded") {
	
		@in = split(' ',$in);
	
		$now_all = 0;
		$num = shift(@in);
	
		foreach $el (@in) {
			if($el eq "[[") {
				$el = " [[ "; 
			} elsif($el eq "]]") {
				$el = " ]] "; 
			} elsif($el eq "->") {
				$el = " -> "; 
				$now_all = 1;
			} elsif($now_all == 0) {
				$el = $me->_single_compactify($el);
			} elsif($now_all == 1) {
				$el = " " . $el . " ";
			}
		}
		$out = join('',@in);
		$out = "$num " . $out;
		return($out);
	}
	else {
		return($in);
	}
}

sub _compactify_input {
	my $me = shift;
	my $in = shift;
	my $el;
	my $out;

	if($me->{STYLE} eq "compact") {
		return($in);
	} else {
		@in = split(' ',$in);
	
		foreach $el (@in) {
			$el = $me->_single_compactify($el);
		}
		$out = join('',@in);
		return($out);
	}
}

sub _single_compactify {
	my $me = shift;
	my $inp = shift;

	if(defined($me->{SYMTAB}->{$inp})) {
		return($me->{SYMTAB}->{$inp});
	} elsif($reading_rules == 1) {
		$me->{SYMTAB}->{$inp} = shift(@acceptable_symbol_table_chars_are);
		if($me->{SYMTAB}->{$inp} ne '') {
			return($me->{SYMTAB}->{$inp});
		} else {
			Carp::croak("Predefined symbol table size insufficient\n");
		}
	} else {
		print STDERR "Symbol $inp not found in symbol table\n";
		return('');
	}
}

sub _decompactify_output {
	my $me = shift;
	my $targ = shift;

	if($me->{STYLE} eq "expanded" ) {
		@ebits = split('',$targ);
		foreach $ebit (@ebits) {
			if(defined($me->{REVSYMTAB}->{$ebit})) {
				$ebit = $me->{REVSYMTAB}->{$ebit};
			} else {
				$ebit = "<invalid>";
			}
		}
		$targ2 = join(' ',@ebits);
	} else {
		$targ2 = $targ;
	}
	return($targ2);
}

1;
