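# Perl script that applies Japanese line-breaking rules to the all-in-one
# HTML page: characters that must not start or end a line are wrapped in
# <nobr>, so that the text file generated from it is not broken there.
#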

# Main script body: read the all-in-one HTML page, strip the per-chapter
# navigation bars and decorative tags, wrap every sequence that must not be
# split across lines in <nobr>...</nobr>, and write the result to a
# temporary file for w3m to parse.
#
# NOTE: there is deliberately no "use utf8" here.  The Japanese literals
# below are kept as raw bytes and converted to character strings by p(),
# exactly like the file contents.
use strict;
use warnings;

# all in 1 file created by using debiandoc2html -1
my $allin1 = "debian-faq.html/index.html";

# temporary file for w3m to parse
my $tempfile = "debian-faq-temp.html";

# characters that cannot be the last character of a line
# (only ever interpolated inside [...] character classes below)
my $no_line_end = '(「';

# characters that cannot be the first character of a line
# (only ever interpolated inside [...] character classes below)
my $no_line_start = '、。んぁぃぅぇぉっゃゅょンァィゥェォッャュョー」),\?';

# each chapter has anchors (toc, prev, next, 1, 2, ...)
# this is useless in the text version; this is the alternation of their labels
my $nav_labels = '前のページ|次のページ|目次|\d+';

# decorative tags
my $tags = 'em|strong|i|b|samp|tt|code';

# read the whole all in 1 file in one go
open(my $in, '<', $allin1)
    or die "Cannot open $allin1 for reading: $!\n";
my $html = do { local $/; <$in> };
$html = '' unless defined $html;
close $in
    or die "Cannot close $allin1: $!\n";

# flag utf-8: turn the raw bytes into character strings so that the
# character classes below operate on whole Japanese characters
$html          = p($html);
$no_line_end   = p($no_line_end);
$no_line_start = p($no_line_start);
$nav_labels    = p($nav_labels);

# literal fragments of the generated HTML used by the anchor-removal pattern
my $href_open = '<a href="#';
my $hr        = "<hr>\n";

# collapse every run of CR/LF into a single newline
$html =~ s/[\r\n]+/\n/g;

# remove anchors: a <p> block made only of "[ label ]" lines between two
# <hr> rules is replaced by a single <hr>
$html =~ s/$hr<p>\n(\[ ($href_open(ch-[a-z_]+|contents|index)">)?($nav_labels)(\<\/a>)? \]\n)+<\/p>\n$hr/$hr/g;

# do not break entity in the next hyphenation process
$html =~ s/&(quot|amp|gt|lt|copy);/<nobr>&$1;<\/nobr>/g;

# remove decorative tags
$html =~ s/<\/?($tags)>//g;

# Japanese hyphenation.  The order of the following substitutions matters:
# each one relies on the <nobr> tags inserted by the previous ones.

# a non-ASCII-alphanumeric character followed by no-line-start characters
$html =~ s/([^a-zA-Z0-9<>$no_line_start][$no_line_start]+)/<nobr>$1<\/nobr>/g;

# an ASCII word followed by no-line-start characters, after space . , or -
$html =~ s/([ .,-])([a-zA-Z0-9]+[$no_line_start]+)/$1<nobr>$2<\/nobr>/g;

# NOTE(review): "(!<<nobr>)" matches the literal text "!<<nobr>"; it looks
# like a mistyped negative lookbehind "(?<!<nobr>)".  As written this
# substitution can only fire on that literal text.  Left unchanged on purpose
# because correcting it would change the generated output -- confirm intent.
$html =~ s/((!<<nobr>)[^a-zA-Z0-9])([a-zA-Z0-9]+[$no_line_start]+)/$1<nobr>$2<\/nobr>/g;

# an ASCII word followed by no-line-start characters, right after </nobr>
# (or at the \G position)
$html =~ s/(<\/nobr>|\G)([a-zA-Z0-9]+[$no_line_start]+)/$1<nobr>$2<\/nobr>/g;

# pull no-line-end characters that sit just before a <nobr> inside it
$html =~ s/([$no_line_end]+)<nobr>/<nobr>$1/g;

# no-line-end characters, an optional ASCII word, then no-line-start characters
$html =~ s/([$no_line_end]+([a-zA-Z0-9]+)?[$no_line_start]+)/<nobr>$1<\/nobr>/g;

# write to temporary file
open(my $out, '>:utf8', $tempfile)
    or die "Cannot open $tempfile for writing: $!\n";
print {$out} $html
    or die "Cannot write to $tempfile: $!\n";
# buffered write errors only surface at close, so check it too
close $out
    or die "Cannot close $tempfile: $!\n";

# utf-8 flag
# Takes one string of UTF-8 encoded bytes and returns it repacked with the
# "U0C*" template, i.e. with the byte sequences joined into characters.
# The input is not validated.
sub p {
    my ($octets) = @_;

    # split into individual byte values ...
    my @byte_values = unpack 'C*', $octets;

    # ... and pack them again in U0 (UTF-8) mode
    return pack 'U0C*', @byte_values;
}
