#!/usr/bin/perl -w


use strict;

use vars qw (%Options);
use Getopt::Long;


###############################################################################
# created: 2005-04-07 Eva Forsbom evafo@stp.ling.uu.se
#
# Help text shown for -h/--help.  The text starts with two empty lines and
# ends with one empty line; $0 is replaced by the name the script was
# invoked under.
my $usage = <<"END_USAGE";


Usage: $0 [-h|--help] <DATA>

Description:  This Perl script makes 10 different selections of 10 divisions from the data file.
Output: 10 different 10-fold divisions.

Required arguments:
  <DATA>  where DATA is the output from the commands

  xsltproc xces2r.xsl <SUC.xml>+ ;
  wc -l <SUC.dat>+ | grep -v total

  and <SUC.xml>+ are any number of XML-formatted SUC files,
  and <SUC.dat>+ are any number of data files from the previous command.

Optional arguments:
  -h|--help  prints this help message to STDOUT

END_USAGE
#
#
###############################################################################

# Get options.
# GetOptions() returns false when the command line contains an invalid
# option (it has already written a warning to STDERR by then); in that case
# show the usage on STDERR and stop with a non-zero exit status instead of
# carrying on and treating the remaining arguments as data files.
# "c|columns" is still accepted so that existing invocations keep working;
# nothing in the visible part of this script reads its value.
unless (GetOptions(\%Options, "h|help", "c|columns")) {
    print STDERR $usage;
    exit 2;
}

# Print help.
# Help that was asked for explicitly goes to STDOUT, as the usage text
# itself states, and the script exits successfully.
if ($Options{"h"}) {
    print STDOUT $usage;
    exit 0;
}


# SUC texts with (sub)categories and number of tokens (to pick from)
my %suc;
# Number-name index for looking up random texts
my @suc_map;

# Total number of tokens
my $total_tokens = 0;


# Process the data file(s) named on the command line (or STDIN) and store
# the information.  Each line is expected to hold a token count followed by
# a text name, separated by white space, as described in the usage text.
while (my $line = <>) {
    chomp $line;

    # get number of tokens and SUC text; splitting on the special pattern
    # ' ' ignores leading white space and treats any run of white space as
    # a single separator, so padded counts and multiple blanks are handled
    my ($tokens, $text) = split ' ', $line;

    # nothing on this line at all
    next unless defined $tokens;

    # a line without a name or with a non-numeric count would otherwise end
    # up as an undefined/empty text or corrupt the token total
    unless (defined $text and $tokens =~ /\A\d+\z/) {
        warn "$0: skipping malformed input line $.: $line\n";
        next;
    }

    # a repeated name would be counted twice in the total and listed twice
    # in the index although it can only be selected once
    if (exists $suc{$text}) {
        warn "$0: skipping repeated text '$text' on input line $.\n";
        next;
    }

    # store the info as selectable
    $suc{$text} = $tokens;

    # store the name in an index
    push @suc_map, $text;

    # update token count
    $total_tokens += $tokens;
}


# Number of entries in the name index; serves as the exclusive upper bound
# when a random position in @suc_map is drawn
my $text_index = scalar(@suc_map);


# Nominal number of tokens per division: one tenth of all tokens
my $division_size = ($total_tokens) / 10;


# Make 10 different selections to use for cross-validation.
#
# Every selection ("seed") distributes the texts in %suc over 10 divisions:
# divisions 1-9 are filled by drawing random texts from the name index,
# division 10 consists of the texts that are left over.  The report is
# written to STDOUT.

# Base value for the random seeds.  time() has a resolution of one second
# and the ten selections normally finish within the same second, so
# seeding with the bare time() in every iteration restarts the same random
# sequence each time and yields ten identical selections.  Adding the
# selection number to a value taken once gives every selection its own seed.
my $seed_base = time;

# Maximum deviation from the nominal division size (1% of the division
# size, ~1000 tokens for a division of ~100000 tokens)
my $max_dev = $division_size / 100;

# A randomised division must stay strictly below this number of tokens
my $division_limit = $division_size + $max_dev;

foreach my $seed (1..10) {

    # print to report
    print "*** Start seed $seed ***\n";

    # set a new, distinct seed for randomising this selection
    srand($seed_base + $seed);

    # in the beginning all texts are selectable
    my %selectables = %suc;

    # divide the texts into 10 approximately equally-sized divisions:
    # 9 randomised divisions (the tenth is the residual of selectable texts)
    foreach my $division (1..9) {

        # print to report
        print "***** Start division $division *****\n";

        # start token count
        my $token_count = 0;

        # select texts until division is full
        while ($token_count < $division_limit) {

            # draw a random text; one that has already been used in this
            # selection is skipped and another one is drawn
            my $text = $suc_map[ int(rand($text_index)) ];
            next unless exists $selectables{$text};

            my $n = $selectables{$text};

            # a text that does not fit below the limit closes the division
            # (the text itself stays selectable)
            last unless ($token_count + $n) < $division_limit;

            # update count
            $token_count += $n;

            # "move" text from selectables to division
            print "******* Text $text, total $n *******\n";
            delete $selectables{$text};

            # division size reached?
            last if $token_count > $division_size;
        }

        # print to report
        print "***** End division $division, total $token_count *****\n";
    }


    # print the tenth division
    # print to report
    print "***** Start division 10 *****\n";

    # start token count
    my $token_count = 0;

    # process residuals: everything not selected for divisions 1-9
    foreach my $text (keys %selectables) {
        print "******* Text $text, total $selectables{$text} *******\n";
        $token_count += $selectables{$text};
    }

    # print to report
    print "***** End division 10, total $token_count *****\n";


    # print to report
    print "*** End seed $seed, total $total_tokens ***\n";

}


