#!/usr/bin/perl
# -*- coding: iso-8859-1 -*-
#
# pronouncible.pl
#
# Elias Schwerdtfeger, http://tamagothi.wordpress.com/
# 2009-05-25
#
# Try to apply some rules to make the Voynich transcription pronounceable.
# The motivation is to make the differences between the two Currier
# "languages" audible; this is not serious research. Of course, I am
# convinced that the VMs isn't written language.
#
# But there are people who don't believe in the two different Currier
# "languages", because they are an abstract statistical result that they
# don't understand. The difference is hard to see in a transcription
# file, so I want to make it obvious by creating a pronounceable version.

use strict;
use warnings;

# ATTENTION!
#
# I use an environment variable VOYNICH, which points to 
# the interlinear transcription archive. This makes it easier
# for me to use my hacks on different computers. (This script
# was written on my old HP Jornada 820e, which is a nice
# mobile workhorse for me.)
#
# Just change the following line to your local convention.
my $path = $ENV{'VOYNICH'} or die 'No environment var VOYNICH';

# The converted text is written next to the input file, one file per
# Currier "language" (A, B) and one for pages without a language tag (X).
my $outa = $path . ".pro.a.txt";
my $outb = $path . ".pro.b.txt";
my $outx = $path . ".pro.x.txt";

# Three-argument open with lexical handles: the path comes from the
# environment and must never be interpreted as an open mode.
open my $in,  '<', $path or die "Failed to open $path $!";
open my $ofa, '>', $outa or die "Failed to open $outa $!";
open my $ofb, '>', $outb or die "Failed to open $outb $!";
open my $ofx, '>', $outx or die "Failed to open $outx $!";

# Current language and output file. Until the first page header has been
# seen there is no language information, so transcription lines go to
# the "X" file instead of dying on an undefined filehandle.
my $lang = 'X';
my $of   = $ofx;

while (my $ln = <$in>)
{
  # Skip comments
  next if ($ln =~ /^\#/);

  # Page header with parseable information?
  # Determine language and output file, and report progress on STDOUT.
  if ($ln =~ /^\<(f\d+[rv]\d?)\>\s*\{([^\}]+)\}/)
  {
    print "$1 ";
    my $pinf = $2;
    if ($pinf =~ /\$L=(.)/)
    {
      # Every language code other than 'A' is written to the B file.
      $lang = $1;
      $of = ($lang eq 'A') ? $ofa : $ofb;
    }
    else
    {
      $lang = 'X';
      $of = $ofx;
    }
    print "lang=$lang\n";
  }

  # Extract locator and text
  chomp $ln;
  next unless ($ln =~ /^\<([^\>]+)\>\s*(.*)$/);
  my $loc = $1;
  my $txt = $2;

  # Only Takeshi Takahashi's full transcription
  # (you may set your favorite here)
  next unless ($loc =~ /H$/); # H = Takeshi Takahashi

  # Normalize the text line and split it in words
  $txt =~ s/\{[^\}]*\}//g; # Remove inline comments
  $txt =~ s/[\,\-\=]/./g; # Consistent spacing with dots
  $txt =~ s/[^a-z\.\*]//g; # Remove unwanted transcription stuff
  $txt =~ s/\.+$//g; # Remove trailing dots

  # A leading separator yields an empty first field from split; drop
  # empty words so that they do not show up as doubled spaces.
  my @cnv = map { pronounce_word($_) } grep { length } split /\.+/, $txt;

  # Remove the (now wrong) transcriber code from the locator
  $loc =~ s/\;.$//;

  # Write the converted line to the output file
  print {$of} sprintf ("<%s> %s\n", $loc, join (' ', @cnv));
}

# Buffered write errors only show up at close, so check the output files.
close $ofx or die "Failed to close $outx $!";
close $ofb or die "Failed to close $outb $!";
close $ofa or die "Failed to close $outa $!";
close $in;

# pronounce_word WORD
#
# Applies the substitution rules below, in exactly this order, to one
# transcribed word and returns the result in lowercase. The rules are
# arbitrary and sound rather German, because that is the author's native
# language. The order matters: every rule works on the output of the
# rules before it.
sub pronounce_word
{
  my ($w) = @_;

  # The conversions produce uppercase letters to avoid side effects in
  # the sequence of substitutions: the early rules only look at the
  # lowercase letters that are still unconverted.

  # in-groups to vowel E
  $w =~ s/iiiin/EULEM/g;
  $w =~ s/iiin/EM/g;
  $w =~ s/iin/EN/g;
  $w =~ s/in/EL/g;

  # il-groups to vowel O
  $w =~ s/iiiil/EULOM/g;
  $w =~ s/iiil/OM/g;
  $w =~ s/iil/ON/g;
  $w =~ s/il/OL/g;

  # ir-groups to vowel A
  $w =~ s/iiiir/EULAM/g;
  $w =~ s/iiir/AM/g;
  $w =~ s/iir/AN/g;
  $w =~ s/ir/AL/g;

  # is-groups to vowel U
  $w =~ s/iiiis/EULUM/g;
  $w =~ s/iiis/UM/g;
  $w =~ s/iis/UN/g;
  $w =~ s/is/UL/g;

  # im-groups to vowel I
  $w =~ s/iiiim/EULIM/g;
  $w =~ s/iiim/IM/g;
  $w =~ s/iim/IN/g;
  $w =~ s/im/IL/g;

  # ee-groups to diphthong AU
  $w =~ s/eeee/AUVE/g;
  $w =~ s/eee/äU/g;
  $w =~ s/ee/AU/g;

  # ch-clusters with gallows
  $w =~ s/([ci])([tkpf])([hoy])/$2ö$1$3/g;
  # A gallows directly in front of ch/ih/sh gets an umlaut inserted. The
  # second group has to be the character class [cis]; spelled as the
  # literal text "cis]" this rule could never match a cleaned-up word.
  $w =~ s/([tkpf])([cis])h/$1ü$2h/g;

  # double h in ch-clusters
  $w =~ s/hh/hEH/g;

  # ch-clusters
  $w =~ s/ch/ST/g;
  $w =~ s/sh/SCH/g;
  $w =~ s/ih/TSCH/g;

  # initial and final y
  $w =~ s/^y/AN/g;
  $w =~ s/dy$/LICH/g;
  $w =~ s/y$/UNG/g;
  $w =~ s/y(.)$/$1EHUNG/g;

  # initial and final s
  $w =~ s/^s/SE/g;
  $w =~ s/s$/ES/g;

  # initial and final d
  $w =~ s/^d/GE/g;
  $w =~ s/d$/KEIT/g;
  # The lowercase i inserted here is picked up by the "remaining i"
  # rules further down.
  $w =~ s/d(.)$/di$1/g;

  # (rare) double d
  $w =~ s/dd/DETH/g;

  # final m
  $w =~ s/mm$/SCHAFT/g;
  $w =~ s/m$/HEIT/g;

  # initial and final l
  $w =~ s/^l/ME/g;
  $w =~ s/l$/IG/g;

  # (rare) remaining i and e
  $w =~ s/e/I/g;
  $w =~ s/iii/EM/g;
  $w =~ s/ii/EN/g;
  $w =~ s/i/E/g;

  # initial q and qo
  $w =~ s/^qo/UND-/g;
  $w =~ s/^q/ODER-/g;

  # let the gallows sound more German... ;-)
  $w =~ s/p/PFE/g;
  $w =~ s/f/BE/g;
  $w =~ s/t/FE/g;
  $w =~ s/k/THE/g;

  # Some funny case-insensitive conversions to make the pronunciation
  # easier (for a German).
  # This process may destroy some information or make it really hard
  # to go back to the original text with a set of regular expressions.
  # This is no serious research but an attempt to make it "audible" to
  # Germans.
  $w =~ s/([PFBT]+)([AEIOU])([AEIOUäöü])/$1$3/ig;
  $w =~ s/^TSCH/ETSCH/ig;
  $w =~ s/aa/ACHE/ig;
  $w =~ s/([aeio])ung/$1NUNG/ig;
  $w =~ s/([aeiu])ing/$1NING/ig;
  $w =~ s/a([aeiouäöü])/EN$1/ig;
  $w =~ s/ah/AN/ig;
  $w =~ s/oh/ON/ig;
  $w =~ s/ea/ENNA/ig;
  $w =~ s/eo/ENNO/ig;
  $w =~ s/oa/ONNA/ig;
  $w =~ s/auung/AHNUNG/ig;
  $w =~ s/eü/EHü/ig;
  $w =~ s/fung/FANG/ig;
  $w =~ s/chd/CHID/ig;
  $w =~ s/uo/UH/ig;
  $w =~ s/ou/OH/ig;
  $w =~ s/ue/UH/ig;
  $w =~ s/tt/TET/ig;
  $w =~ s/^ge([aeiou])/G$1/i;
  $w =~ s/st([pfbt]+)/STE$1/gi;
  $w =~ s/ying$/LING/i;
  # Case-sensitive on purpose: only a g that is still lowercase here.
  $w =~ s/g/LICHT/g;

  # The finished word is delivered in lowercase.
  return lc $w;
}

# That's all