#!/usr/bin/perl
# mkreader - a perl script to format REED records queried from a database
# early draft to prepare a reader's package .rtf file from an .html export
# of ASCII coded file
use HTML::FormatRTF;
use RTF::Writer;
use HTML::Entities;
# use HTML::FormatRTF to xlate html, then use RTF::Writer for better
control
# of the document
# for now, we actually are only using FormatRTF
open LOGFILE, ">logfile.reader"
or die
"Can't write-open logfile.reader: $!\nAborting";
open RECORDRTF, ">readerout.rtf"
or die
"Can't write-open readerout.rtf: $!\nAborting";
open NOTERTF, ">notesout.rtf"
or die
"Can't write-open notesout.rtf: $!\nAborting";
open WTF, ">logfile.wtf";
# DBI initializations to point us at the correct datasource
$driver = "mysql";
$database = "reed";
$hostname = "localhost";
$dbuser = "reed";
$dbpwd = "reed";
use DBI;
$dsn =
"DBI:$driver:database=$database;host=$hostname";
$dbh = DBI->connect( $dsn, $dbuser, $dbpwd );
$recordsquery = "select * from reed ";
$notesquery = "select * from footnotes";
# prepare the records query
$dbrsth = $dbh->prepare( "
$recordsquery
" );
# prepare the notes query
$dbnsth = $dbh->prepare( "
$notesquery
" );
# execute the records query
$dbrsth->execute;
$templatehead = '{\rtf1\ansi\deff0{\fonttbl
{\f0\fnil\fcharset0 Times New Roman;}
{\f1\fnil\fcharset128 Arial Unicode MS;}
}
{\colortbl;\red255\green0\blue0;\red0\green0\blue255;}
{\info
{\creatim \yr2008\mo7\dy23\hr21\min10\sec55}
{\revtim \yr2008\mo7\dy23\hr21\min10\sec55}
{\title Inns of Court Readers Package [July 2008] The Records}
{\author Alan Nelson}
{\comment Draft RTF conversion for Digital Startup Pilot Project}
{\doccomm written by \'2e/writeREED [Perl RT\'46::Writer v1\'2e11]}
}
{\header\pard \tqr\tx9360 \brdrb \brdrs \brdrw10 \brsp24 \qr\plain\f0{Inns
of Court Readers Package [July 2008] \endash The Records
\tab\tqr}\chpgn\par}
\paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0
\deftab1440
\widowctl
\f0
';
$notehead = '{\rtf1\ansi\deff0{\fonttbl
{\f0\fnil\fcharset0 Times New Roman;}
{\f1\fnil\fcharset128 Arial Unicode MS;}
}
{\colortbl;\red255\green0\blue0;\red0\green0\blue255;}
{\info
{\creatim \yr2008\mo7\dy23\hr21\min10\sec55}
{\revtim \yr2008\mo7\dy23\hr21\min10\sec55}
{\title Inns of Court Footnotes}
{\author Alan Nelson}
{\comment Draft RTF conversion for Digital Startup Pilot Project}
{\doccomm written by \'2e/writeREED [Perl RT\'46::Writer v1\'2e11]}
}
{\header\pard \tqr\tx9360 \brdrb \brdrs \brdrw10 \brsp24 \qr\plain\f0{Inns
of Court \endash Footnotes \tab\tqr}\chpgn\par}
\paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0
\deftab960
\widowctl
\f0
';
print LOGFILE "RECORDSQUERY: $recordsquery\n";
while ( $hash_ref =
$dbrsth->fetchrow_hashref('NAME_lc') )
{
# clear things out & use variable copies instead of hash refs
$id = $hash_ref->{id};
$description = $hash_ref->{description};
$repository = $hash_ref->{repository};
$docdate = $hash_ref->{docdate};
$text = $hash_ref->{text};
$printedsource = $hash_ref->{printedsource};
print LOGFILE "TEXT: <<$text\n>>\n\n";
print WTF "CLEANTEXT: <<$text\n>>\n\n";
# we're deleting the entire heading, and constructing our own
if ($printedsource) {
$printedsourcething =
"$printedsource
";
}
else { $printedsourcething = ""; }
$text =~
s/
/[KEEPN]$printedsourcething\n$docdate<\/b>
$description<\/B><\/I>
$repository
$foliation<\/p>[\/KEEPN]/s;
# fiddle the markup
$text = textfiddle($text);
print LOGFILE "FIDDLETEXT: <<$text>>\n\n";
# add it to the outgoing string
$outtext .= $text;
print LOGFILE
"Here it goes\n$outtext\nENDIT\n\n";
} # end of the while loop
# format the outgoing text
$formattedtext = HTML::FormatRTF->format_string(
$outtext,
'normal_halfpoint_size' => 24,
'header_halfpoint_size' => 24,
'head1_halfpoint_size' => 24,
'head2_halfpoint_size' => 24,
'head3_halfpoint_size' => 24,
'fontname_body' => 'Times New Roman',
'fontname_headings' => 'Times New Roman',
);
# fiddle the outgoing formatted text
$formattedtext = rtffiddle($formattedtext);
# sub new templatehead
$formattedtext =~ s/^.*?fs22\n/$templatehead/s;
print RECORDRTF $formattedtext;
close(RECORDRTF);
# footnotes
# execute the footnotesquery
$dbnsth->execute;
# log it
print LOGFILE "NOTESQUERY: $notesquery\n";
# open the ordered list
#$note = "\n";
# loop through the returned rows
while ( $hash_ref =
$dbnsth->fetchrow_hashref('NAME_lc') )
{
$fnum = $hash_ref->{fnum};
$fnote = $hash_ref->{fnote};
# clear out carriage returns, grr
$fnote =~ s/
//g;
print LOGFILE "$fnum: $fnote\n";
#$note = "";
# ordered list...hmmm
#$note = "
- $fnote<\/li>\n";
#$note = "
$fnum\[TAB]$fnote<\/p>\n";
$note =
"
$fnum<\/hi>[TAB]$fnote<\/p>\n";
$note = textfiddle($note);
print LOGFILE "$note\n";
$outnote .= $note;
}
# close the ordered list
#$note .= "<\/ol>";
#$note .= "
";
# format as RTF
$formattednote =
HTML::FormatRTF->format_string($outnote);
# fiddle the RTF
$formattednote = rtffiddle($formattednote);
# sub new templatehead
$formattednote =~ s/^.*?fs22\n/$notehead/s;
$formattednote =~ s/\\li0/\\li960\\fi-960/g;
$formattednote =~ s/\\sa220//g;
print NOTERTF $formattednote;
close(NOTERTF);
# END OF THE LINE
# subroutines
sub textfiddle() {
print WTF "RAW: $_[0]\n>>\n";
$mytext = $_[0];
print WTF "PASSED: $_[0]\n>>\n";
# left marginales
$mytext =~ s/@l\\(.*?)@l \\/$1 []/;
# right marginales
$mytext =~ s/@r\\(.*?)@r \\/$1 []/;
# hanging left thingy...
$mytext =~
s/(.*?)<\/p>/
$1<\/p>/g;
# headings
#$mytext =~ s/
(.*?)<\/head>/$1/g;
#$mytext =~ s/(.*?)<\/head>/$1/g;
# this has to be done outside the function
# now, we're deleting the entire heading, and constructing our own
#if ($printedsource) {
# $printedsourcething = "$printedsource
";
# }
#else { $printedsourcething = ""; }
#$mytext =~ s//[KEEPN]$printedsourcething\n$docdate<\/b>
$description<\/B><\/I>
$repository
$foliation<\/p>[\/KEEPN]/s;
# line breaks
$mytext =~ s//
/g;
# formatting
$mytext =~ s/<\/*note[^>]*>//g;
$mytext =~ s/--/\/g;
$mytext =~
s/(.*?)<\/hi><\/hi>/$1<\/b><\/i>/g;
$mytext =~
s/(.*?)<\/hi><\/hi>/##SCAPS#$1#SCAPS##<\/i>/g;
$mytext =~
s/(.*?)<\/emph>/$1<\/i>/g;
$mytext =~
s/(.*?)<\/hi>/$1<\/b>/g;
$mytext =~
s/(.*?)<\/hi>/$1<\/i>/g;
$mytext =~
s/(.*?)<\/hi>/##SCAPS#$1#SCAPS##/g;
$mytext =~
s/(.*?)<\/hi>/$1<\/sup>/g;
$mytext =~ s/\@\[/[TAB]\@\[/g;
#$mytext =~ s/ /[PICA]@p; /g;
$mytext =~
s/ /[PICA]/g;
$mytext =~
s/(.*?)<\/hi>/$1<\/center>/g;
$mytext =~ s/(
\n)*
\n\n*<\/p>/<\/p>/g;
return $mytext;
}
# END OF TEXT FIDDLING SUB
#
sub rtffiddle() {
$tofiddle = $_[0];
# a little postprocessing: Smallcaps don't seem to work
while ( $tofiddle =~
s/##SCAPS#(.*?)#SCAPS##/{\\scaps $1}/g )
{
print LOGFILE "Gotone: $1\n";
}
# START FORMATTEDTEXTFIDDLING
$tofiddle =~ s/(\\line\n) */$1/g;
$tofiddle =~
s/{\\pard\\sa220\\li0\\ri0\\ql\\plain\n\n\\par}//g;
# more end of line lopping
$tofiddle =~ s/\\line\n*\n\\par/\\par/g;
# catch untranslated entities, turn into unicode
$tofiddle =~ s/\⌈/{\\f1\\u8968\\f0}/g
; # LEFT CEILING beginning of interlineation above the line
$tofiddle =~ s/\⌉/{\\f1\\u8969\\f0}/g
; # RIGHT CEILING end of interlineation above the line
$tofiddle =~
s/\æ/\\u230 /g; # LATIN SMALL LETTER AE
$tofiddle =~ s/\Æ/\\u198 /g
; # LATIN CAPITAL LETTER AE
$tofiddle =~ s/\⌊/{\\f1\\u8970\\f0}/g
; # LEFT FLOOR beginning of interlineation below the line ()
$tofiddle =~ s/\⌋/{\\f1\\u8971\\f0}/g
; # RIGHT FLOOR end of interlineation below the line ()
$tofiddle =~ s/\ℂ/{\\f1\\u8450\\f0}/g
; # DOUBLE-STRUCK CAPITAL C (that's not right...) code for a
capitulum ()
$tofiddle =~
s/\ð/\\u240 /g; # code for an eth ()
$tofiddle =~ s/\°/\\u176 /g
; # interpolation "bubble", naively using degree character
$tofiddle =~ s/\ŋ/\\u331 /g
; # LATIN SMALL LETTER ENG Welsh .ng. (looks like combined .nj.) ()
$tofiddle =~ s/\Ŋ/\\u330 /g
; # LATIN CAPITAL LETTER ENG Welsh .NG. (looks like combined .NJ.)
()
#$tofiddle =~ s/\œ/\\u339 /g; # LATIN SMALL LIGATURE OE oe
diphthong in ligature ()
$tofiddle =~ s/\œ/\\'9c/g
; # LATIN SMALL LIGATURE OE oe diphthong in ligature ()
$tofiddle =~
s/\…/\\'85/g; # HORIZONTAL ELLIPSIS
$tofiddle =~ s/\Œ/\\u338 /g
; # LATIN CAPITAL LIGATURE OE diphthong in ligature ()
$tofiddle =~ s/\¶/\\u182 /g
; # PILCROW SIGN paragraphus sign (¶)
$tofiddle =~ s/\þ/\\u254 /g
; # LATIN SMALL LETTER THORN code for a thorn ()
$tofiddle =~ s/\Þ/\\u222 /g
; # LATIN CAPITAL LETTER THORN code for capital thorn ()
$tofiddle =~ s/\ƿ/\\u447 /g
; # LATIN LETTER WYNN wynn ()
$tofiddle =~ s/\ȝ/\\u541 /g
; # LATIN SMALL LETTER YOGH code for yogh ()
$tofiddle =~ s/\Ȝ/\\u540 /g
; # LATIN CAPITAL LETTER YOGH code for capital yogh ()
$tofiddle =~ s/\!/\\u33 /g
; # EXCLAMATION MARK exclamation point (!)
$tofiddle =~ s/\⁁/{\\f1\\u8257\\f0}/g
; # CARET INSERTION POINT caret symbol preceding interlineation
$tofiddle =~ s/\£/\\u163 /g
; # POUND SIGN pound sign (£)
$tofiddle =~ s/\⁏/{\\f1\\u8271\\f0}/g
; # REVERSED SEMICOLON inverted semicolon ()
$tofiddle =~ s/\·/\\u183 /g
; # MIDDLE DOT raised period, like a semi-colon without the comma
part
$tofiddle =~ s/\§/\\u167 /g
; # SECTION SIGN section symbol ()
$tofiddle =~ s/\ç/\\u231 /g
; # LATIN SMALL LETTER C WITH CEDILLA
$tofiddle =~ s/\â/\\u226 /
; # LATIN SMALL LETTER A WITH CIRCUMFLEX
$tofiddle =~
s/\[PICA\]/\\sa240 /g; # PICASPACE
$tofiddle =~ s/\[EMSP\]/\\sa120 /g; # EMSPACE
$tofiddle =~
s/\[TAB\]/\\tab /g; # TAB, flush-right
# KEEPN & KEEP
$tofiddle =~ s/\n\[KEEPN\]/\\keep\\keepn\n/g;
$tofiddle =~ s/\[\/KEEPN\]//g;
$tofiddle =~ s/\\line\n*\\par/\\par/g;
return $tofiddle;
}
# WORKING NOTES/DRAFTS
# START FORMATTEDTEXTFIDDLING
if ($elephants) {
$formattedtext =~ s/(\\line\n) */$1/g;
$formattedtext =~
s/{\\pard\\sa220\\li0\\ri0\\ql\\plain\n\n\\par}//g;
# more end of line lopping
$formattedtext =~ s/\\line\n*\n\\par/\\par/g;
# catch untranslated entities, turn into unicode
$formattedtext =~
s/\⌈/{\\f1\\u8968\\f0}/g
; # LEFT CEILING beginning of interlineation above the line
$formattedtext =~
s/\⌉/{\\f1\\u8969\\f0}/g
; # RIGHT CEILING end of interlineation above the line
$formattedtext =~
s/\æ/\\u230 /g; # LATIN SMALL LETTER AE
$formattedtext =~ s/\Æ/\\u198 /g
; # LATIN CAPITAL LETTER AE
$formattedtext =~
s/\⌊/{\\f1\\u8970\\f0}/g
; # LEFT FLOOR beginning of interlineation below the line ()
$formattedtext =~
s/\⌋/{\\f1\\u8971\\f0}/g
; # RIGHT FLOOR end of interlineation below the line ()
$formattedtext =~
s/\ℂ/{\\f1\\u8450\\f0}/g
; # DOUBLE-STRUCK CAPITAL C (that's not right...) code for a
capitulum ()
$formattedtext =~
s/\ð/\\u240 /g; # code for an eth ()
$formattedtext =~ s/\°/\\u176 /g
; # interpolation "bubble", naively using degree character
$formattedtext =~ s/\ŋ/\\u331 /g
; # LATIN SMALL LETTER ENG Welsh .ng. (looks like combined .nj.) ()
$formattedtext =~ s/\Ŋ/\\u330 /g
; # LATIN CAPITAL LETTER ENG Welsh .NG. (looks like combined .NJ.)
()
$formattedtext =~ s/\œ/\\u339 /g
; # LATIN SMALL LIGATURE OE oe diphthong in ligature ()
$formattedtext =~ s/\Œ/\\u338 /g
; # LATIN CAPITAL LIGATURE OE diphthong in ligature ()
$formattedtext =~ s/\¶/\\u182 /g
; # PILCROW SIGN paragraphus sign (¶)
$formattedtext =~ s/\þ/\\u254 /g
; # LATIN SMALL LETTER THORN code for a thorn ()
$formattedtext =~ s/\Þ/\\u222 /g
; # LATIN CAPITAL LETTER THORN code for capital thorn ()
$formattedtext =~ s/\ƿ/\\u447 /g
; # LATIN LETTER WYNN wynn ()
$formattedtext =~ s/\ȝ/\\u541 /g
; # LATIN SMALL LETTER YOGH code for yogh ()
$formattedtext =~ s/\Ȝ/\\u540 /g
; # LATIN CAPITAL LETTER YOGH code for capital yogh ()
$formattedtext =~ s/\!/\\u33 /g
; # EXCLAMATION MARK exclamation point (!)
$formattedtext =~
s/\⁁/{\\f1\\u8257\\f0}/g
; # CARET INSERTION POINT caret symbol preceding interlineation
$formattedtext =~ s/\£/\\u163 /g
; # POUND SIGN pound sign (£)
$formattedtext =~
s/\⁏/{\\f1\\u8271\\f0}/g
; # REVERSED SEMICOLON inverted semicolon ()
$formattedtext =~ s/\·/\\u183 /g
; # MIDDLE DOT raised period, like a semi-colon without the comma
part
$formattedtext =~ s/\§/\\u167 /g
; # SECTION SIGN section symbol ()
$formattedtext =~ s/\ç/\\u231 /g
; # LATIN SMALL LETTER C WITH CEDILLA
$formattedtext =~ s/\â/\\u226 /
; # LATIN SMALL LETTER A WITH CIRCUMFLEX
$formattedtext =~
s/\[PICA\]/\\sa240 /g; # PICASPACE
$formattedtext =~
s/\[EMSP\]/\\sa120 /g; # EMSPACE
$formattedtext =~
s/\[TAB\]/\\tab /g; # TAB, flush-right
# KEEPN & KEEP
$formattedtext =~
s/\n\[KEEPN\]/\\keep\\keepn\n/g;
$formattedtext =~ s/\[\/KEEPN\]//g;
$formattedtext =~ s/\\line\n*\\par/\\par/g;
# sub new templatehead
$formattedtext =~
s/^.*?fs22\n/$templatehead/s;
}
# END FORMATTEDTEXTFIDDLING
# START TEXT FIDDLING
if ($elephants) {
# left marginales
$text =~ s/@l\\(.*?)@l \\/$1 []/;
# right marginales
$text =~ s/@r\\(.*?)@r \\/$1 []/;
# hanging left thingy...
$text =~
s/(.*?)<\/p>/
$1<\/p>/g;
# headings
# this is what we were doing...
#$text =~ s/
(.*?)<\/head>/$1/g;
#$text =~ s/(.*?)<\/head>/$1/g;
# now, we're deleting the entire heading, and constructing our own
if ($printedsource) {
$printedsourcething =
"$printedsource
";
}
else { $printedsourcething = ""; }
$text =~
s//[KEEPN]$printedsourcething\n$docdate<\/b>
$description<\/B><\/I>
$repository
$foliation<\/p>[\/KEEPN]/s;
# line breaks
$text =~ s//
/g;
# formatting
$text =~ s/<\/*note[^>]*>//g;
$text =~ s/--/\/g;
$text =~
s/(.*?)<\/hi><\/hi>/$1<\/b><\/i>/g;
$text =~
s/(.*?)<\/hi><\/hi>/##SCAPS#$1#SCAPS##<\/i>/g;
$text =~
s/(.*?)<\/emph>/$1<\/i>/g;
$text =~
s/(.*?)<\/hi>/$1<\/b>/g;
$text =~
s/(.*?)<\/hi>/$1<\/i>/g;
$text =~
s/(.*?)<\/hi>/##SCAPS#$1#SCAPS##/g;
$text =~
s/(.*?)<\/hi>/$1<\/sup>/g;
$text =~ s/\@\[/[TAB]\@\[/g;
#$text =~ s/ /[PICA]@p; /g;
$text =~ s/ /[PICA]/g;
$text =~
s/(.*?)<\/hi>/$1<\/center>/g;
$text =~ s/(
\n)*
\n\n*<\/p>/<\/p>/g;
}
# END TEXTFIDDLING
# copyright 2009 Records of Early English Drama