#!/usr/bin/perl -w

use strict;

use vars qw (%Options);
use Getopt::Long;


###############################################################################
# created: 2004-09-04 Eva Forsbom evafo@stp.ling.uu.se
#
my $usage = "\n\nUsage: $0 [-h|--help] <SUC2.0-PAROLE-FILE(s)>\n\n".
    "Description:  This Perl script converts SUC2.0-PAROLE file(s) into XML compliant file(s)).\n".
    "\n".
    "Optional arguments:\n".
    "  -h|--help  prints this help message to STDOUT\n".
    "\n\n".
    "Additional info:\n".
    "You probably have to reset the file name for the dbcentx.mod package,\n".
    "available from http://www.oasis-open.org/docbook/xml/4.2/,\n".
    "in the \$dbcentx variable.\n".
    "\n\n".
    "License:\n".
    "You may use or modify this script as you like, but do it at your own risk.\n".
    "\n";
#
#
# version 0.011
# 2005-01-01: Corrected a bug that put the end tag for 'div' in the wrong place.
###############################################################################

# Get options
GetOptions (\%Options, "h|help");

# Print help
if (exists $Options{"h"} and $Options{"h"}) {
    print STDERR $usage;
    exit;
}

my $dbcentx ="/usr/share/sgml/docbook/xml-dtd-4.1.2-1.0-26/dbcentx.mod";


### Sub routines used
sub xmlify;
sub entity;


# keep track of tag hierarki (used for unclosed tags)
my @stack = ();


# go through files
foreach my $file (@ARGV) {
    my $out = $file;
    # split full path into directory, filename, and extension
    # use filename, and replace any extension with 'xml'
    $out =~ s/(.*\/)?([^\/\.]+)(\.?.*)$/$2.xml/;
    my $filename = $2;

    open IN, "<$file";
    open OUT, ">$out";

    # include standard entity definitions
    print OUT '<?xml version="1.0" encoding="iso-8859-1"?>', "\n\n",
    "<!DOCTYPE tei.2 [\n\n",
     "<!ENTITY % dbcent PUBLIC
     \"-//OASIS//ENTITIES DocBook XML Character Entities V4.
1.2//EN\"
     \"$dbcentx\">\n",
     "%dbcent;\n",
   "]>\n\n";


    # keep track of tag hierarki (used for unclosed tags)
    @stack = ();


    # assuming no tag has split-line content
    while (<IN>) {
	print OUT xmlify($_);
    }

    close IN;
    close OUT;
}


#############################################################################
# Sub routines

# not used
# Convert reserved XML characters to entities
sub entity {
    my $text = shift;

    # contain any reserved characters?
    if ($text =~ /[&]/) {
	# replace
	$text =~ s/&/&amp;/g;
    }


    # contain any reserved characters?
    if ($text =~ /[<>\"\']/) {
	# replace
	# why does it replace & after the first replacement???
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/\"/&quot;/g;
	$text =~ s/\'/&apos;/g;
    }


    return $text;
}


# Convert to proper XML
sub xmlify {
    
    # replace single & in attribute
    s/lem=[\"\']&[\"\']/lem=\"&amp;\"/;

    # replace single & in content
    s/&([< ])/&amp;$1/;

    # take care of special cases
    s/1&3/1&amp;3/g;
    s/&([-_])/&amp;$1/g;

    # replace single < in attribute
    s/lem=[\"\']<[\"\']/lem=\"&lt;\"/;

    # replace single < in content
    s/<([< ])/&lt;$1/;

    # take care of special cases
    s/<([=-])/&lt;$1/g;

    # replace single > in attribute
    s/lem=[\"\']>[\"\']/lem=\"&gt;\"/;

    # replace single > in content
    s/([> ])>/$1&gt;/;

    # take care of special cases
    s/->/-&gt;/g;

    # remember = in content
    s/>(.*)=(.*)</>$1&eq;$2</;

    # add quotes around attribute values if missing
    s/=([^\"\'][^ >]*)/=\"$1\"/g;

    # put back = in content
    s/&eq;/=/;


    # check for unclosed and empty tags (div, catRef)
    # remember opening tag
    if (/<([[:alpha:]][^ \/>]*)/) {
	my $tag = $1;

	# check for unclosed tags (div)
	if ($tag eq "div") {
	    my $lasttag = pop @stack;

	    # check for unclosed tags at the same tag level
	    if ($lasttag eq "div") {
		s/^/<\/$lasttag>\n/;
	    }
	    # none, so restore to previous state
	    else {
		push @stack, $lasttag;
	    }
	    push @stack, $tag;
	    return $_;
	}

	push @stack, $tag;

	# check for empty tags (catRef)
	if ($tag eq "catRef") {
	    pop @stack;
 	    s/>$/\/>/;
	    return $_;
	}
    }
    # synchronise opening and ending tags
    if (/<\/([[:alpha:]][^ \/>]*)/) {
	my $tag = $1;
	my $lasttag = pop @stack;
	until ($tag eq $lasttag) {
#	    print STDERR "TAG:$tag,LASTTAG:$lasttag\n";
	    s/^/<\/$lasttag>\n/;
	    $lasttag = pop @stack;
	}
    }

    return $_;

}



