#!/usr/bin/perl
#-----------------------------------------------------------------------------
# prepi2h.pl
# Created Apr 2001: Avinash Chopde <avinash@aczoom.com> http://www.aczoom.com/
my $VERSION = "1.01";
# Last Modified: May 11, 2001
#-----------------------------------------------------------------------------
# Pre-process ITRANS encoded text files (like 1.s isb song files)
# Can handle non songs files too.
# Reads STDIN or files given as arguments, outputs new text to STDOUT
# Used for TTF in ITRANS - xdvng - it does not have punctuation, so need
# to escape all punctuation inside ITRANS text. All other fonts (itxbeng, etc)
# are ok - they have punctuation chars, so the punctuation need not be escaped.
# Note that this is a very approximate process - in some cases, the
# substitutions will be made to non-ITRANS text, and ITRANS text itself
# may get garbled. But in most cases, this should work to clean up HTML
# for Xdvng.
# ----------------------------------------------------------------------------
# xdvng does not have punctuation, so it has to be escaped (using ##..##, which
# gets processed by itrans itself).
# ----------------------------------------------------------------------------
# Assumes that #indian, #hindi, #sanskrit, #marathi all use Xdvng.
# Rest of the tags will not be modified.
# ----------------------------------------------------------------------------
#
# Special handling:
# % text comment lines need to be sent unchanged through this filter.
# 
# Example input file:
# % 
# \startsong
# \stitle{jogii jabase tuu aayaa mere dvaare}%
# ....
# % 
# % Credits:  
# \printtitle
# #indian
# %
# jogii jabase tuu aayaa mere dvaare
# tuu to a.Nkhiyo.n me.n jaan\-e\-jii kii batiyaa.N
# % some comment here
# o jogii jabase \threedots
# ... 
# %
# #endindian
# \endsong
# %

#-------------------------------------------------------------------------
# This is the main subroutine that performs substitutions on the
# itrans encoded text.
# Changes here include:
#    punctuation, replaced/protected for Xdvng
# Add more substitutions as required here

sub itxsubstitute {
    # Protect punctuation in one fragment of ITRANS text so that it survives
    # rendering with the Xdvng font, which has no punctuation glyphs.
    #
    # Argument: a string that was found between #indian .. #endindian (or
    #           an equivalent marker pair).
    # Returns:  a modified copy of the string in which punctuation runs are
    #           wrapped in ##..## and every & is written as &amp;.
    #
    # The argument is copied into a lexical (my) variable; the caller's
    # value and the package namespace are left untouched.
    my ($lword) = @_;

    # remove any \'s left - this will be \-, \& etc
    $lword =~ s/\\(\W)/$1/g;

    # escape all punctuation (except for . ^ _ ~  --- ITRANS chars)
    # make sure to include all following space chars!
    $lword =~ s/([,;:@\(\)\[\]\/\$\-'`"+!?*=&%|]\s*)/##$1##/g;
    # _ {} # are valid ITRANS chars, so left unchanged

    # change . ^ ~ only if they are not part of ITRANS (^n ~n .n .C etc)
    # 1) a run followed by a character that cannot continue an ITRANS code
    $lword =~ s/([\.^~]+)([^iIcCnNdDtTrhaueo \t])/##$1##$2/g;
    # 2) a run at the very end of the fragment
    $lword =~ s/([\.^~]+$)/##$1##/g;
    # 3) a run followed by white space (the white space is wrapped too)
    $lword =~ s/([\.^~]+\s+)/##$1##/g;

    # minimize extra ##'s created... not really needed, but makes
    # output less verbose if many punctuation characters are used in sequence.
    # $lword =~ s/## ##/ /g;
    # $lword =~ s/####//g;
    # $lword =~ s/###/## #/g; # three #'s result from: ##.###endhindi

    # HTML chars protections - only some handled. Maybe even this should
    # not be done - users can always enclose such constructs outside
    # of ITRANS text...
    # Note: this runs after the punctuation pass, so an input & ends up
    # as ##&amp;## in the result.
    $lword =~ s/&/&amp;/g;

    return $lword;
}

#-------------------------------------------------------------------------
# all ITRANS markers need to be listed here, so this program can get
# the shortmarkers ## handled correctly.

# Marker classes: what seeing a marker means for the punctuation transform.
#   $TOGGLEXFORM - the short marker ##, which flips the current state
#   $NEEDSXFORM  - markers whose text is assumed to use Xdvng
#   $DONTXFORM   - markers whose text must be passed through unmodified
($TOGGLEXFORM, $NEEDSXFORM, $DONTXFORM) = (1, 2, 3);

# The short marker ## both opens and closes, so it is in both tables.
%STARTINDIAN = ("##" => $TOGGLEXFORM);
%ENDINDIAN   = ("##" => $TOGGLEXFORM);

# Every long marker #<name> has a matching closer #end<name>; both get
# the same class.
foreach my $lang (qw(indian hindi marathi sanskrit)) {
    $STARTINDIAN{"#$lang"}  = $NEEDSXFORM;
    $ENDINDIAN{"#end$lang"} = $NEEDSXFORM;
}

foreach my $lang (qw(telugu tamil kannada bengali gujarati gurmukhi roman)) {
    $STARTINDIAN{"#$lang"}  = $DONTXFORM;
    $ENDINDIAN{"#end$lang"} = $DONTXFORM;
}

# Whole words that are replaced outright (key: input word, value: output).
%XFORMWORD = (
    "\\threedots" => ". . .",
);

#-------------------------------------------------------------------------
# State shared with determine_state(); these must stay package globals.
$xformyes = 0; # whether current word is inside xdvng ITRANS markers...
$prevmarker = "#indian";  # short marker ## toggles this on and off

# True when the line just processed contained an ITRANS % comment.
# Declared outside the loop because it is consulted once more after the
# last input line.
my $commentseen = 0;

# Read STDIN or the files named on the command line, one line at a time,
# and write the transformed text to STDOUT.
while (my $line = <>) {
    chomp $line;

    # trim space at end
    $line =~ s/\s+$//;
    # remove last char if it is %
    # $line =~ s/%+$//;

    # first split - main objective is to get words like:
    # #indian #<word> \threedots \<word> ## (also #### should be two ##'s)
    # % is an itrans comment character, ITRANS ignores everything
    # from a % to the end of line... \% is used to make it non-comment char
    my @words = split /(##|#\w+|\\\w+|\\%|%)/, $line;

    my @xline = ();     # output pieces for this line
    $commentseen = 0;   # single line comments % ... handled specially

    foreach my $word (@words) {
        # The capturing split yields empty fields between adjacent
        # markers; skip exactly those.  A plain truth test must not be
        # used here, because it would also discard a fragment that is
        # the text "0".
        next if !defined($word) || $word eq "";

        if (!$commentseen && $word =~ /^%/) {
            # rest of the line is a comment; if we are inside ITRANS
            # text, close it with ## in front of the %
            $commentseen = 1;
            $word =~ s/^%/##%/ if $xformyes;
        }
        if ($commentseen) {
            # everything from the % on is passed through unchanged
            push @xline, $word;
            next;
        }

        # print "found word :$word:\n";

        # first check if the entire word can be replaced
        my $xword = $XFORMWORD{$word};
        if (defined $xword && $xword ne "") {
            # inside ITRANS text the replacement is wrapped in ##..##
            push @xline, ($xformyes ? ("##", $xword, "##") : $xword);
            next; # word was replaced, all done
        }

        # determine xformyes state (a marker word may switch it)
        determine_state($word);

        # call subroutine to make changes only while inside Xdvng text
        push @xline, ($xformyes ? itxsubstitute($word) : $word);

    } # foreach $word, line consumed

    push @xline, "\n";

    # % is an itrans comment character, ITRANS ignores everything
    # from a % to the end of line...
    # % ... \n is all comments, so have to send ## on next line
    push @xline, "##" if $commentseen && $xformyes;

    print join("", @xline);

} # while (<>)

# The last line left a pending ## at the start of a new line; finish it.
print "\n" if ($commentseen && $xformyes);

exit 0; # successful exit

# --------------------------

sub determine_state {
    # Update the global transform state for one word of input.
    #
    # Argument: a single word as produced by the main loop's split.
    # Reads:    %STARTINDIAN, %ENDINDIAN, $TOGGLEXFORM, $NEEDSXFORM,
    #           $DONTXFORM
    # Updates:  $xformyes (true while inside text that needs the
    #           transform) and $prevmarker (the last long start marker,
    #           consulted when the short marker ## is seen).
    # Returns:  nothing useful.
    #
    # Words that appear in neither marker table leave the state unchanged.
    my ($word) = @_;   # lexical copy; do not assign to the global $word

    my $starti = $STARTINDIAN{$word};
    my $endi   = $ENDINDIAN{$word};

    # 0 is not a marker class, so it stands for "not a start marker" and
    # keeps the numeric comparisons below away from undef.
    $starti = 0 unless defined $starti;

    if ($starti == $TOGGLEXFORM) {
        # ## toggles only when the enclosing long marker needs the transform
        $xformyes = ! $xformyes if ($STARTINDIAN{$prevmarker} == $NEEDSXFORM);
    } elsif ($starti == $NEEDSXFORM) {
        $xformyes = 1;
        $prevmarker = $word;
    } elsif ($starti == $DONTXFORM) {
        # $xformyes is left as it is; only the marker is remembered
        $prevmarker = $word;
    } elsif (defined $endi) { # any end marker, other than ## seen
        # completes all start markers, so reset to #indian
        $xformyes = 0;
        $prevmarker = "#indian";
    }

    # print "   xformyes: $xformyes prevmarker: $prevmarker, word $word, starti=$starti\n" if $starti > 0 || defined $endi;

    return;
}
#-------------------------------------------------------------------------
