#!/usr/bin/perl -w

use strict;

###############################################################
# created: 2003-05-28 Eva Forsbom evafo@stp.ling.uu.se
#
my $usage = "\n\nUsage: $0 [-h -n] -r <REGEX-FILE> <STRING-FILE>\n\n".
    "Description:  This Perl script checks if a string is covered by any of the Perl regular expression submitted.\n".
    "Input:        A tab-separated list of \"string\" and any other field to be displayed (STRING-FILE)\n".
    "Output:       A tab-separated list of \"string\" and any regular expressions covering it\n".
    "\n".
    "Required arguments:\n".
    "  -r <REGEX-FILE>    Name of file with regular expressions and any other fields to be displayed\n".
    "Optional arguments:\n".
    "  -h    Print this help message to STDOUT\n".
    "  -n    Print the strings that are not covered by any regular expression\n\n".
    "License:\n".
    "You may use or modify this script as you like, but do it at your own risk.\n".
    "\n".
    "\n";


# Examples:

# Case 1: Only strings and regular expressions (no other fields):

# Regular expressions:
# http://stp.ling.uu.se/~evafo/software/regex.dat

# Strings to check
# http://stp.ling.uu.se/~evafo/software/strings.dat

# Check if the strings are covered by any regular expression (print the ones that are):
# regex.pl -r regex.dat strings.dat

# Check if the strings are covered by any regular expression (print the ones that are not):
# regex.pl -n -r regex.dat strings.dat

# Case 2: Strings and regular expressions with other information (field(s)):

# Regular expressions:
# http://stp.ling.uu.se/~evafo/software/regexF.dat

# Strings to check
# http://stp.ling.uu.se/~evafo/software/stringsF.dat

# Check if the strings (with information) are covered by any regular expression (with information) (print the ones that are):
# regex.pl -r regexF.dat stringsF.dat

# Check if the strings (with information) are covered by any regular expression (with information) (print the ones that are not):
# regex.pl -n -r regexF.dat stringsF.dat

###############################################################


# For options
use vars qw (%Options);
use Getopt::Long;


# Get options
GetOptions (\%Options, "h", "n", "r=s");

# Print help
if (exists $Options{"h"} and $Options{"h"}) {
    print STDERR $usage;
    exit;
}

# Check required options
unless (exists $Options{"r"} and $Options{"r"}) {
    print STDERR $usage;
    exit;
}


open RE, $Options{"r"} or die "Cannot open regular expressions file ".$Options{"r"}.":$!\n\n";

my %re;

# Collect and store the regular expressions
while (<RE>) {
    chomp;
    (my $r, my @rest) = split /\t/, $_;
    
    # remove filling spaces
    $r =~ s/([.]*)\s*$/$1/g;
    
    $re{$r} = [@rest];
    
}

close RE;


# Check if word form is covered by any regular expressions
while (<>) {
    chomp;
    (my $word, my @rest) = split /\t/, $_;
    my $printline = $_;
    my $covered = 0;
    
    # remove filling spaces
    $word =~ s/([.]*)\s*$/$1/g;
    
    foreach my $r (keys %re) {
	if (defined $Options{"n"} and $word !~ /^$r$/) {
	    next;
	} 
	elsif (defined $Options{"n"} and $word =~ /^$r$/) {
	    $covered = 1;
	    last;
	} 
	elsif ($word =~ /^$r$/) {
	    $printline .= "\t$r";	
	    foreach my $i (@{$re{$r}}) {
		$printline .= "\t$i";	
	    }
	}
    }
    # print covered word forms
    if ($printline ne $_) {
	print $printline."\n";	
    }
    # print not covered word forms
    elsif (defined $Options{"n"} and not ($covered)) {
	print $printline."\n";	
    }
}



