# Tokenization program by Gregory Grefenstette. 
# In Hans Van Halteren ed., Syntactic Wordclass Tagging, (Text, Speech & Language Technology S.) 
# Kluwer Academic Publishers 1999.
# Translated from nawk into Perl by Pierre Nugues

use utf8;
binmode(STDOUT, ":encoding(UTF-8)");
binmode(STDIN, ":encoding(UTF-8)");

use strict;
use warnings;
# `<>` opens the files named on the command line itself, so they need their
# own input layer; the binmode() on STDIN at the top of the file only covers
# the case where no file argument is given.
use open IN => ':encoding(UTF-8)';

# --- Character classes -------------------------------------------------------

# Accented letters that count as word characters (written as escapes so the
# patterns do not depend on the encoding of this source file): ÅÄÖåäö
my $letter_accented = "\x{C5}\x{C4}\x{D6}\x{E5}\x{E4}\x{F6}";

# A "letter" for the purpose of the period test; the apostrophe is included.
my $letter     = qr/[A-Za-z'$letter_accented]/;
# Anything that is neither a letter, an apostrophe nor a digit.
my $not_letter = qr/[^A-Za-z'0-9$letter_accented]/;
# Characters that always form a token of their own:  ? ! ( ) " ; / | , `
my $always_sep = qr{[?!()";/|,`]};
# Characters that are split off when they start a token.
my $begin_sep  = qr/['&]/;
# Endings (punctuation and English clitics) that are split off when they end
# a token.  The order of the alternatives is significant and is kept as is.
my $end_sep    = qr/(?:'|:|-|'S|'D|'M|'LL|'RE|'VE|N'T|'s|'d|'m|'ll|'re|'ve|n't)/;
# Tokens that look like acronyms ("U.S.", "e.g.") or like a capital followed
# by consonants and a period ("Mr.", "Dr."); they keep their final period.
my $acronym    = qr/\A(?:[A-Za-z]\.(?:[A-Za-z]\.)+|[A-Z][bcdfghj-np-tvxz]+\.)\z/;

# --- Known abbreviations -----------------------------------------------------

# Tokens listed here keep their final period attached.
my %abbr = map { $_ => 1 } qw(
    Co. Corp. vs. e.g. etc. ex. cf. eg.
    Jan. Feb. Mar. Apr. Jun. Jul. Aug. Sept. Oct. Nov. Dec.
    jan. feb. mar. apr. jun. jul. aug. sept. oct. nov. dec.
    ed. eds. repr. trans. vol. vols. rev. est.
    b. m. bur. d. r.
    M. Dept. MM. U.
    Mr. Jr. Ms. Mme. Mrs. Dr.
);

# Split one line of text into tokens.
#
# Parameter: $line - the text of one input line, with or without its
#                    terminating newline.
# Returns:   the list of tokens in order of appearance (empty for a blank line).
sub tokenize_line {
    my ($line) = @_;

    # Several rules below need a character after the separator; make sure a
    # last line without a newline is treated like every other line.
    $line .= "\n" unless $line =~ /\n\z/;

    # Tabs become plain spaces.
    $line =~ s/\t/ /g;
    # Put blanks around characters that are unambiguous separators.
    $line =~ s/($always_sep)/ $1 /g;
    # Detach a separator that starts the line or follows a blank ...
    $line =~ s/^($begin_sep)/$1 /;
    $line =~ s/( $begin_sep)/$1 /g;
    # ... and put a blank between any other non-letter and a begin separator.
    $line =~ s/($not_letter)($begin_sep)/$1 $2/g;
    # Detach an end separator that is followed by white space.
    $line =~ s/($end_sep\s)/ $1/g;
    # Put a blank between an end separator and a following non-letter.
    # NOTE(review): unlike the rule just above, the blank goes *after* the
    # separator, so "dog's." becomes "dog's ." and the clitic stays attached.
    # This is the behaviour of the original program - confirm it is intended.
    $line =~ s/($end_sep)($not_letter)/$1 $2/g;

    my @tokens;
    for my $word (split ' ', $line) {
        # A final period is split off only when the token contains a letter
        # followed by a period and is neither a listed abbreviation nor
        # shaped like an acronym.
        my $detach_period = $word =~ /$letter\./
            && !$abbr{$word}
            && $word !~ $acronym;

        if ($detach_period && $word =~ s/\.\z//) {
            push @tokens, $word, '.';
        }
        else {
            push @tokens, $word;
        }
    }
    return @tokens;
}

# Read the input line by line and print one token per output line.
while (my $line = <>) {
    print "$_\n" for tokenize_line($line);
}
