#!/usr/bin/perl
# vim:sw=4 sta showmatch

# texi_xml - convert Texi-XML to Texinfo
#            (See DocBook documentation for details)
#
# Copyright (C) 2000 Steve Cheng <steve@ggi-project.org>
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# 
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING.  If not, please write
# to the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
# USA.
# 

use strict;

package texixml;

use XML::Parser::PerlSAX;
use XML::SGMLSpl;

my $sgmlspl = new XML::SGMLSpl;

##################################################
#
# Output routines
#
##################################################

sub output {
    my $text = shift;

    if($text =~ s/^\n//) {
	# Force newline if needed
	print OUT "\n" unless $texixml::newline_last++;
    }
    return if $text eq '';
    
    print OUT $text;

    $texixml::newline_last = ($text =~ /\n$/);
}

# CHECKME: is this really needed?
#
sub trim_string
{
    my $name = shift;
    for ($name) {
	tr/ \t\n/ /s;
	s/^ +//mg;
	s/ +$//mg;
    }
    return $name;
}

sub texi_arg_escape
{
    my $s = shift;
    $s =~ tr/,/./;
    return $s;
}

# Escape Texinfo syntax chars
#
sub texi_escape
{
    my $s = shift;
    $s =~ s/([\@\{\}])/\@$1/g;
    return $s;
}

# Allows output to be saved when $texixml::output_save is set,
# otherwise same as output.
#
# Savable output does not stack, unlike SGMLS::Output.
#
# When saving output, this function does not have any line-breaking 
# semantics of regular output() and will NOT work with block-level
# elements. (Saving of arbitrary output has many other complications
# which we will not attempt solve here to solve in the interests of 
# minimizing code bloat.)
#
sub savable_output
{
    if(defined $texixml::output_save) {
	$texixml::output_save .= shift;
    } else {
	&output;
    }
}
	
    
##################################################
#
# A clean solution to the extra-newlines problem
#
##################################################

# texi_xml keeps track of what type of element (block, inline 
# or neither) it just processed, then makes line breaks for
# the next element only if necessary
#
# These functions are passed the *parent* element,
# i.e. the container element of the element possibly needing
# a break.
#
sub block_break
{
    if(defined $texixml::output_save) {
	die "block_break called while saving output";
    }

    my $lastchild = shift->ext->{'lastchild'};
    output "\n\n" if $lastchild eq 'inline' || $lastchild eq 'block';
}

sub inline_break
{
    if(defined $texixml::output_save) {
	return;
    }
    
    my $lastchild = shift->ext->{'lastchild'};

    # Example:
    # <para>Warning<para>Some text</para>Do not indent this text
    # since it's part of the same paragraph</para>

    output "\n\n\@noindent\n" if $lastchild eq 'block';
}



##################################################
#
# Main document
#
##################################################

$sgmlspl->{start_element}->{'texinfo'} = sub {
    my ($elem, $sgmlspl) = @_;

    if(defined $elem->attribute('file')) {
	$texixml::file = $elem->attribute('file');
    } elsif($texixml::inputfile ne '-') {
	$texixml::file = $texixml::inputfile;
	$texixml::file =~ s/\.txml$//;
    } else {
	$texixml::file = 'noname';
    }

    open(OUT, ">${texixml::file}.texi");

    $texixml::newline_last = 1;
    output "\\input texinfo\n";
    output "\n\@setfilename ${texixml::file}.info\n";

    # expat feeds us utf-8, whether Texinfo likes it or not
    output "\@documentencoding utf-8\n";
};

$sgmlspl->{end_element}->{'texinfo'} = sub {
    output "\n\n\@bye\n";
    close(OUT);
};


##################################################
#
# Menus, nodes
#
##################################################

$sgmlspl->{start_element}->{'node'} = sub {
    my ($elem, $sgmlspl) = @_;
    my $node = node_escape(texi_escape($elem->attribute('name')));
    output "\n\n\@node $node\n";
};

$sgmlspl->{start_element}->{'menu'} = sub {
    my ($elem, $sgmlspl) = @_;
    
    $elem->ext->{'outputmode'} = 'strip-newline';
    $elem->parent->ext->{'lastchild'} = 'block';
    output "\n\n\@menu\n";
};
$sgmlspl->{end_element}->{'menu'} = sub {
    my ($elem, $sgmlspl) = @_;
    output "\n\@end menu\n";
};
$sgmlspl->{start_element}->{'detailmenu'} = sub {
    my ($elem, $sgmlspl) = @_;
    output "\n\n\@detailmenu\n";
};
$sgmlspl->{end_element}->{'detailmenu'} = sub {
    my ($elem, $sgmlspl) = @_;
    output "\n\@end detailmenu\n";
};

$sgmlspl->{start_element}->{'menuitem'} = sub {
    my ($elem, $sgmlspl) = @_;
    $elem->ext->{'outputmode'} = 'strip-newline';
    $texixml::output_save = '';
    
    $elem->parent->ext->{'lastchild'} = 'inline';
};

$sgmlspl->{end_element}->{'menuitem'} = sub {
    my ($elem, $sgmlspl) = @_;

    my $entry = $texixml::output_save;
    $texixml::output_save = undef;

    # Although the menu entry is not constrained to the set
    # of characters allowed for node names, the use of ':'
    # to separate the parts of menu entry implies that it
    # is not an allowed character.
    $entry = node_escape($entry);
    
    my $node = node_escape(texi_escape($elem->attribute('node')));
    if($entry eq $node) {
	output "\n* ${entry}::\t";
    } else {
	output "\n* ${entry}: ${node}.\t";
    }
};

# Do escaping for nodenames:
# NOTE: stylesheets should do this if possible
# since there can be rare name clashes.
sub node_escape
{
    my $name = shift;
    for ($name) {
	tr/().,:'/[];;;_/;
	tr/ \t\n/ /s;
	s/^ +//mg;
	s/ +$//mg;
    }
    return $name;
}

##################################################
#
# Info directory
#
##################################################

$sgmlspl->{start_element}->{'dircategory'} = sub {
    my ($elem, $sgmlspl) = @_;
    output "\n\@dircategory ";
    
    $elem->ext->{'outputmode'} = 'strip-newline';
};
$sgmlspl->{end_element}->{'dircategory'} = sub {
    output "\n";
};

$sgmlspl->{start_element}->{'direntry'} = sub {
    output "\n\@direntry\n";
};
$sgmlspl->{end_element}->{'direntry'} = sub {
    output "\n\@end direntry\n";
};

 
##################################################
#
# Internationalization
#
##################################################

# Allowing a common lang attribute on all elements 
# would really help XML applications...

$sgmlspl->{start_element}->{'documentlanguage'} = sub {
    my ($elem, $sgmlspl) = @_;
    output "\n\@documentlanguage " . $elem->attribute('lang') . "\n";
};

##################################################
#
# Sectioning elements
#
##################################################

sub section_start_handler {
    my ($elem, $sgmlspl) = @_;
    
    $elem->parent->ext->{'lastchild'} = '';
    output "\n\@" . $elem->name . ' ';
    
    $elem->ext->{'outputmode'} = 'strip-newline';
}

sub section_end_handler {
    output "\n";
}

foreach my $gi
    (qw(chapter section subsection subsubsection
       majorheading chapheading heading subheading subsubheading
       top unnumbered unnumberedsec unnumberedsubsec unnumberedsubsubsec
       appendix appendixsec appendixsubsec appendixsubsubsec)) 
{
    $sgmlspl->{start_element}->{$gi} = \&section_start_handler;
    $sgmlspl->{end_element}->{$gi} = \&section_end_handler;
}





##################################################
#
# Paragraph
#
##################################################

$sgmlspl->{start_element}->{'para'} = sub {
    &block_start_common_handler;
};
$sgmlspl->{end_element}->{'para'} = sub {
    &block_end_common_handler;
};
    
sub block_start_common_handler {
    my ($elem) = @_;
    block_break($elem->parent);
    $elem->parent->ext->{'lastchild'} = 'block';
}
sub block_end_common_handler {
    my ($elem) = @_;
    output "\n";
}


##################################################
#
# Verbatim displays
#
##################################################

sub verbatim_block_start_handler 
{
    &block_start_common_handler;
    my ($elem, $sgmlspl) = @_;
    output '@' . $elem->name . "\n";
    $elem->ext->{'outputmode'} = 'preserve';
}
sub verbatim_block_end_handler
{
    &block_end_common_handler;
    my ($elem, $sgmlspl) = @_;
    output "\n\@end " . $elem->name . "\n";
}

foreach my $gi
    (qw(example display format)) {
    $sgmlspl->{start_element}->{$gi} = \&verbatim_block_start_handler;
    $sgmlspl->{end_element}->{$gi} = \&verbatim_block_end_handler;
}

$sgmlspl->{start_element}->{'quotation'} = sub {
    &block_start_common_handler;
    my ($elem, $sgmlspl) = @_;
    output "\@quotation\n";
};

$sgmlspl->{end_element}->{'quotation'} = sub {
    &block_end_common_handler;
    output "\n\@end quotation\n";
};



##################################################
#
# Lists
#
##################################################

$sgmlspl->{start_element}->{'enumerate'} = sub {
    &block_start_common_handler;
    my ($elem, $sgmlspl) = @_;
    output "\@enumerate " . $elem->attribute('begin') . "\n";
};

$sgmlspl->{end_element}->{'enumerate'} = sub {
    &block_end_common_handler;
    output "\n\@end enumerate\n";
};

$sgmlspl->{start_element}->{'itemize'} = sub {
    &block_start_common_handler;
    my ($elem, $sgmlspl) = @_;
    
    # FIXME: mark attribute is defined to take values that are the names of 
    # glyphs in Texinfo, which simplifies stylesheets for many document
    # types such as HTML and DocBook.  However, it may be generally more
    # useful if this attribute (or another one) accepted actual
    # characters for the mark instead.
    if($elem->attribute('mark') ne '') {
	output "\@itemize \@" . texi_escape($elem->attribute('mark')) . "\n";
    } else {
	output "\@itemize\n";
    }
};

$sgmlspl->{end_element}->{'itemize'} = sub {
    &block_end_common_handler;
    my ($elem, $sgmlspl) = @_;
    output "\n\@end itemize\n";
};

$sgmlspl->{start_element}->{'table'} = sub {
    &block_start_common_handler;
    my ($elem, $sgmlspl) = @_;
    output "\n\@table \@asis\n";
};
$sgmlspl->{end_element}->{'table'} = sub {
    &block_end_common_handler;
    output "\n\@end table\n";
};

$sgmlspl->{start_element}->{'item'} = sub {
    my ($elem, $sgmlspl) = @_;
    
    block_break($elem->parent);
    $elem->parent->ext->{'lastchild'} = '';
    output "\n\@item ";
    
    $elem->ext->{'outputmode'} = 'strip-newline';
};
$sgmlspl->{end_element}->{'item'} = sub {
    output "\n";
};
$sgmlspl->{start_element}->{'itemx'} = sub {
    my ($elem, $sgmlspl) = @_;
    
    $elem->parent->ext->{'lastchild'} = '';
    output "\n\@itemx ";
    
    $elem->ext->{'outputmode'} = 'strip-newline';
};
$sgmlspl->{end_element}->{'itemx'} = sub {
    output "\n";
};

$sgmlspl->{start_element}->{'multitable'} = sub {
    &block_start_common_handler;
    my ($elem, $sgmlspl) = @_;
    # FIXME Support prototype attr
    output "\n\@multitable \@columnfractions " . 
	trim_string($elem->attribute('columnfractions')) . "\n";
};
$sgmlspl->{end_element}->{'multitable'} = sub {
    &block_end_common_handler;
    output "\n\@end multitable\n";
};

$sgmlspl->{start_element}->{'tab'} = sub {
    output "\n\@tab ";
};
    

##################################################
#
# Spacing
#
##################################################

$sgmlspl->{start_element}->{'sp'} = sub {
    my ($elem, $sgmlspl) = @_;
    output "\n\@sp " . $elem->attribute('n') . "\n";
};
$sgmlspl->{start_element}->{'page'} = sub {
    output "\n\@page\n";
};



##################################################
#
# Inline elements
#
##################################################

sub inline_start_handler {
    my ($elem, $sgmlspl) = @_;
    inline_break($elem->parent);
    savable_output '@'. $elem->name . '{';

# My initial impression was that nested inlines are not allowed
# in Texinfo, but the tools actually work correctly on them. Good!
#        unless $elem->parent->parent->ext->{'lastchild'} eq 'inline';

    $elem->parent->ext->{'lastchild'} = 'inline';
}
sub inline_end_handler {
    my ($elem, $sgmlspl) = @_;
    savable_output "}";
# Same reason as above.
#        unless $elem->parent->parent->ext->{'lastchild'} eq 'inline';
}

foreach my $gi
    (qw(code samp cite email dfn file sc acronym emph strong key kbd var i b r t math footnote)) {
    $sgmlspl->{start_element}->{$gi} = \&inline_start_handler;
    $sgmlspl->{end_element}->{$gi} = \&inline_end_handler;
}

$sgmlspl->{start_element}->{'anchor'} = sub {
    my ($elem, $sgmlspl) = @_;
    savable_output '@anchor{' . node_escape(texi_escape($elem->attribute('node'))) . '}';
    $elem->parent->ext->{'lastchild'} = 'inline';
};
    
##################################################
#
# Cross references, links
#
##################################################

sub crossref_start_handler {
    my ($elem, $sgmlspl) = @_;
    inline_break($elem->parent);
    
    $elem->ext->{'outputmode'} = 'strip-newline';
    $texixml::output_save = '';
    
    $elem->parent->ext->{'lastchild'} = 'inline';
}

sub crossref_end_handler {
    my ($elem, $sgmlspl) = @_;

    # Syntax:
    # @ref{$node,$infoname,$printname,$file,$printmanual}
    # node - required
    # infoname, printname - optionally specified by inline content 
    #     (I don't think there is much utility in having these separate)
    # infofile, printfile - optional
    
    my $node = node_escape(texi_escape($elem->attribute('node')));
    
    my $printname = texi_arg_escape($texixml::output_save);
    $texixml::output_save = undef;
    my $infoname;

    # If the node and cross reference name turn out to be
    # the same, make the latter empty so info won't display it
    # twice.
    $infoname = ($node eq $printname) ? '' : $printname;
    
    my $file = texi_escape($elem->attribute('file'));
    my $printmanual = texi_escape($elem->attribute('printmanual'));

    # Required part
    output '@' . $elem->name . '{' . $node;

    # Reference to another file
    if($file ne '' and $file ne $texixml::file) {
	output ",$infoname,$printname,$file,$printmanual}";
    }
    else {
	# No inline content either, so use one-argument @ref
	if($printname eq '') { output "}"; }
	else { output ",$infoname,$printname}"; }
    }
    
    # Texinfo's ugly hack
    output "." unless $elem->name eq 'pxref';
}

foreach my $gi (qw(xref ref pxref)) {
    $sgmlspl->{start_element}->{$gi} = \&crossref_start_handler;
    $sgmlspl->{end_element}->{$gi} = \&crossref_end_handler;
}



##################################################
#
# URI references
#
##################################################

$sgmlspl->{start_element}->{'uref'} = sub {
    my ($elem, $sgmlspl) = @_;
    inline_break($elem->parent);
    
    $elem->ext->{'outputmode'} = 'strip-newline';
    $texixml::output_save = '';
    
    $elem->parent->ext->{'lastchild'} = 'inline';
};

$sgmlspl->{end_element}->{'uref'} = sub {
    my ($elem, $sgmlspl) = @_;
    
    my $url = texi_escape($elem->attribute('url'));
    my $text = texi_arg_escape($texixml::output_save);
    $texixml::output_save = undef;
    
    if($text eq '') {
        output "\@uref{$url}";
    } else {
	output "\@uref{$url,$text}";
    }
};


##################################################
#
# Graphics
#
##################################################

$sgmlspl->{start_element}->{'image'} = sub {
    my ($elem, $sgmlspl) = @_;
    
    # FIXME Should we resolve URIs?
    my $filename = texi_escape($elem->attribute('filename'));
    my $width = texi_escape($elem->attribute('width'));
    my $height = texi_escape($elem->attribute('height'));
    
    # Texinfo says @image can be used for inline and displayed images
    # ... I'm pleasantly surprised.
    inline_break($elem->parent);

    savable_output "\@image{$filename,$width,$height}";
    
    $elem->parent->ext->{'lastchild'} = 'inline';
};


##################################################
#
# Indices
#
##################################################

$sgmlspl->{start_element}->{'indexterm'} = sub {
    # CHECKME Texinfo does not appear to allow inline index terms.
    # Is it reasonable to disallow them from appearing inside inline
    # elements in the document type?

    my ($elem, $sgmlspl) = @_;

    inline_break($elem->parent);
    
    output "\n\@" . $elem->attribute('class') . 'index ';
    $elem->ext->{'outputmode'} = 'strip-newline';

    $elem->parent->ext->{'lastchild'} = 'inline';
};
$sgmlspl->{end_element}->{'indexterm'} = sub {
    output "\n";
};

$sgmlspl->{start_element}->{'printindex'} = sub {
    my ($elem, $sgmlspl) = @_;
    block_break($elem->parent);
    output "\n\@printindex " . $elem->attribute('class') . "\n";
    $elem->parent->ext->{'lastchild'} = 'block';
};



##################################################
#
# Character data
#
##################################################

$sgmlspl->{characters} = sub {
    my ($s, $elem, $sgmlspl) = @_;

    # Escape Texinfo syntax chars
    $s = texi_escape($s);

    # Output mode of '' does not override parent default modes
    # Since there is not much element nesting in Texi-XML, 
    # hopefully this search isn't too inefficient ...
    #
    my $outputmode;
    for (my $this = $elem; $this; $this = $this->parent) {
	$outputmode = $this->ext->{'outputmode'} and last;
    }
    
    if($outputmode eq '') {
	# Weed out extra whitespace
        $s =~ tr/ \t/ /s;
	
	# Collapse newlines
	$s =~ tr/\n//s;

        # No spaces at beginning of lines
        $s =~ s/^ // if $texixml::newline_last;
	$s =~ s/(\n \n?)+/\n/g;

	if($s =~ /[^\s]/) {
	    inline_break($elem);
	    $elem->ext->{'lastchild'} = 'inline';
	}
	
	output $s;
    }
     
    elsif($outputmode eq 'strip-newline') {
	# Newlines, die!
        $s =~ tr/ \t\n/ /s;

	# NOTE: output_save only implemented for strip-newline mode.
	if(defined $texixml::output_save) {
	    # No spaces at beginning of lines
	    $s =~ s/^ +// if $texixml::output_save eq '';
	    
	    $texixml::output_save .= $s;
	} else {
	    # No spaces at beginning of lines
	    $s =~ s/^ +// if $texixml::newline_last;

	    if($s ne '') {
		inline_break($elem);
		$elem->ext->{'lastchild'} = 'inline';
		output $s;
	    }
	}
    
    } elsif($outputmode eq 'preserve') {
	if($texixml::newline_last and $s =~ /^\n/) {
	    # Make another line anyway
	    output "\n\n";
	}

	inline_break($elem);
	$elem->ext->{'lastchild'} = 'inline';

	output $s;
    
    } else {
	die("Unknown output mode $outputmode\n");
    }

};
			     
##################################################
#
# Comments
#
##################################################

$sgmlspl->{comment} = sub {
    my ($s, $elem, $sgmlspl) = @_;

    $s =~ tr/\n/ /s;
    # This doesn't work inside inline markup, but hopefully no one does that
    output '@c ' . $s . "\n";
};




##################################################
#
# Texinfo processing instruction
#
##################################################

$sgmlspl->{processing_instruction} = sub {
    my ($target, $data, $sgmlspl) = @_;

    # Allow processing instructions to use newlines(?)
    $data =~ s/\&#xA;/\n/g;
    $data =~ s/\&#10;/\n/g;

    if($target eq 'texinfo') {
	output $data;
    }
};



##################################################
#
# Ignition
#
##################################################

unshift(@ARGV, '-') unless @ARGV;
my $parser = XML::Parser::PerlSAX->new(DocumentHandler => $sgmlspl);

foreach $texixml::inputfile (@ARGV)
{
    $parser->parse(Source => { SystemId => $texixml::inputfile });
}

