#!/usr/bin/perl # mkreader - a perl script to format REED records queried from a database # early draft to prepare a reader's package .rtf file from an .html export # of ASCII coded file use HTML::FormatRTF; use RTF::Writer; use HTML::Entities; # use HTML::FormatRTF to xlate html, then use RTF::Writer for better control # of the document # for now, we actually are only using FormatRTF open LOGFILE, ">logfile.reader" or die "Can't write-open logfile.reader: $!\nAborting"; open RECORDRTF, ">readerout.rtf" or die "Can't write-open readerout.rtf: $!\nAborting"; open NOTERTF, ">notesout.rtf" or die "Can't write-open notesout.rtf: $!\nAborting"; open WTF, ">logfile.wtf"; # DBI initializations to point us at the correct datasource $driver = "mysql"; $database = "reed"; $hostname = "localhost"; $dbuser = "reed"; $dbpwd = "reed"; use DBI; $dsn = "DBI:$driver:database=$database;host=$hostname"; $dbh = DBI->connect( $dsn, $dbuser, $dbpwd ); $recordsquery = "select * from reed "; $notesquery = "select * from footnotes"; # prepare the records query $dbrsth = $dbh->prepare( " $recordsquery " ); # prepare the notes query $dbnsth = $dbh->prepare( " $notesquery " ); # execute the records query $dbrsth->execute; $templatehead = '{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Times New Roman;} {\f1\fnil\fcharset128 Arial Unicode MS;} } {\colortbl;\red255\green0\blue0;\red0\green0\blue255;} {\info {\creatim \yr2008\mo7\dy23\hr21\min10\sec55} {\revtim \yr2008\mo7\dy23\hr21\min10\sec55} {\title Inns of Court Readers Package [July 2008] The Records} {\author Alan Nelson} {\comment Draft RTF conversion for Digital Startup Pilot Project} {\doccomm written by \'2e/writeREED [Perl RT\'46::Writer v1\'2e11]} } {\header\pard \tqr\tx9360 \brdrb \brdrs \brdrw10 \brsp24 \qr\plain\f0{Inns of Court Readers Package [July 2008] \endash The Records \tab\tqr}\chpgn\par} \paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0 \deftab1440 \widowctl \f0 '; $notehead = '{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Times New Roman;} {\f1\fnil\fcharset128 Arial Unicode MS;} } {\colortbl;\red255\green0\blue0;\red0\green0\blue255;} {\info {\creatim \yr2008\mo7\dy23\hr21\min10\sec55} {\revtim \yr2008\mo7\dy23\hr21\min10\sec55} {\title Inns of Court Footnotes} {\author Alan Nelson} {\comment Draft RTF conversion for Digital Startup Pilot Project} {\doccomm written by \'2e/writeREED [Perl RT\'46::Writer v1\'2e11]} } {\header\pard \tqr\tx9360 \brdrb \brdrs \brdrw10 \brsp24 \qr\plain\f0{Inns of Court \endash Footnotes \tab\tqr}\chpgn\par} \paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0 \deftab960 \widowctl \f0 '; print LOGFILE "RECORDSQUERY: $recordsquery\n"; while ( $hash_ref = $dbrsth->fetchrow_hashref('NAME_lc') ) { # clear things out & use variable copies instead of hash refs $id = $hash_ref->{id}; $description = $hash_ref->{description}; $repository = $hash_ref->{repository}; $docdate = $hash_ref->{docdate}; $text = $hash_ref->{text}; $printedsource = $hash_ref->{printedsource}; print LOGFILE "TEXT: <<$text\n>>\n\n"; print WTF "CLEANTEXT: <<$text\n>>\n\n"; # we're deleting the entire heading, and constructing our own if ($printedsource) { $printedsourcething = "$printedsource
"; } else { $printedsourcething = ""; } $text =~ s//

[KEEPN]$printedsourcething\n$docdate<\/b>
$description<\/B><\/I> $repository
$foliation<\/p>[\/KEEPN]/s; # fiddle the markup $text = textfiddle($text); print LOGFILE "FIDDLETEXT: <<$text>>\n\n"; # add it to the outgoing string $outtext .= $text; print LOGFILE "Here it goes\n$outtext\nENDIT\n\n"; } # end of the while loop # format the outgoing text $formattedtext = HTML::FormatRTF->format_string( $outtext, 'normal_halfpoint_size' => 24, 'header_halfpoint_size' => 24, 'head1_halfpoint_size' => 24, 'head2_halfpoint_size' => 24, 'head3_halfpoint_size' => 24, 'fontname_body' => 'Times New Roman', 'fontname_headings' => 'Times New Roman', ); # fiddle the outgoing formatted text $formattedtext = rtffiddle($formattedtext); # sub new templatehead $formattedtext =~ s/^.*?fs22\n/$templatehead/s; print RECORDRTF $formattedtext; close(RECORDRTF); # footnotes # execute the footnotesquery $dbnsth->execute; # log it print LOGFILE "NOTESQUERY: $notesquery\n"; # open the ordered list #$note = "

    \n"; # loop through the returned rows while ( $hash_ref = $dbnsth->fetchrow_hashref('NAME_lc') ) { $fnum = $hash_ref->{fnum}; $fnote = $hash_ref->{fnote}; # clear out carriage returns, grr $fnote =~ s/ //g; print LOGFILE "$fnum: $fnote\n"; #$note = "

    "; # ordered list...hmmm #$note = "

  1. $fnote<\/li>\n"; #$note = "

    $fnum\[TAB]$fnote<\/p>\n"; $note = "

    $fnum<\/hi>[TAB]$fnote<\/p>\n"; $note = textfiddle($note); print LOGFILE "$note\n"; $outnote .= $note; } # close the ordered list #$note .= "<\/ol>"; #$note .= "

    "; # format as RTF $formattednote = HTML::FormatRTF->format_string($outnote); # fiddle the RTF $formattednote = rtffiddle($formattednote); # sub new templatehead $formattednote =~ s/^.*?fs22\n/$notehead/s; $formattednote =~ s/\\li0/\\li960\\fi-960/g; $formattednote =~ s/\\sa220//g; print NOTERTF $formattednote; close(NOTERTF); # END OF THE LINE # subroutines sub textfiddle() { print WTF "RAW: $_[0]\n>>\n"; $mytext = $_[0]; print WTF "PASSED: $_[0]\n>>\n"; # left marginales $mytext =~ s/@l\\(.*?)@l \\/$1 []/; # right marginales $mytext =~ s/@r\\(.*?)@r \\/$1 []/; # hanging left thingy... $mytext =~ s/

    (.*?)<\/p>/

    $1<\/p>/g; # headings #$mytext =~ s/(.*?)<\/head>/$1/g; #$mytext =~ s/(.*?)<\/head>/$1/g; # this has to be done outside the function # now, we're deleting the entire heading, and constructing our own #if ($printedsource) { # $printedsourcething = "$printedsource
    "; # } #else { $printedsourcething = ""; } #$mytext =~ s//

    [KEEPN]$printedsourcething\n$docdate<\/b>
    $description<\/B><\/I> $repository
    $foliation<\/p>[\/KEEPN]/s; # line breaks $mytext =~ s//
    /g; # formatting $mytext =~ s/<\/*note[^>]*>//g; $mytext =~ s/--/\–/g; $mytext =~ s/(.*?)<\/hi><\/hi>/$1<\/b><\/i>/g; $mytext =~ s/(.*?)<\/hi><\/hi>/##SCAPS#$1#SCAPS##<\/i>/g; $mytext =~ s/(.*?)<\/emph>/$1<\/i>/g; $mytext =~ s/(.*?)<\/hi>/$1<\/b>/g; $mytext =~ s/(.*?)<\/hi>/$1<\/i>/g; $mytext =~ s/(.*?)<\/hi>/##SCAPS#$1#SCAPS##/g; $mytext =~ s/(.*?)<\/hi>/$1<\/sup>/g; $mytext =~ s/\@\[/[TAB]\@\[/g; #$mytext =~ s/ /[PICA]@p; /g; $mytext =~ s/ /[PICA]/g; $mytext =~ s/(.*?)<\/hi>/

    $1<\/center>/g; $mytext =~ s/(
    \n)*
    \n\n*<\/p>/<\/p>/g; return $mytext; } # END OF TEXT FIDDLING SUB # sub rtffiddle() { $tofiddle = $_[0]; # a little postprocessing: Smallcaps don't seem to work while ( $tofiddle =~ s/##SCAPS#(.*?)#SCAPS##/{\\scaps $1}/g ) { print LOGFILE "Gotone: $1\n"; } # START FORMATTEDTEXTFIDDLING $tofiddle =~ s/(\\line\n) */$1/g; $tofiddle =~ s/{\\pard\\sa220\\li0\\ri0\\ql\\plain\n\n\\par}//g; # more end of line lopping $tofiddle =~ s/\\line\n*\n\\par/\\par/g; # catch untranslated entities, turn into unicode $tofiddle =~ s/\⌈/{\\f1\\u8968\\f0}/g ; # LEFT CEILING beginning of interlineation above the line $tofiddle =~ s/\⌉/{\\f1\\u8969\\f0}/g ; # RIGHT CEILING end of interlineation above the line $tofiddle =~ s/\æ/\\u230 /g; # LATIN SMALL LETTER AE $tofiddle =~ s/\Æ/\\u198 /g ; # LATIN CAPITAL LETTER AE $tofiddle =~ s/\⌊/{\\f1\\u8970\\f0}/g ; # LEFT FLOOR beginning of interlineation below the line () $tofiddle =~ s/\⌋/{\\f1\\u8971\\f0}/g ; # RIGHT FLOOR end of interlineation below the line () $tofiddle =~ s/\ℂ/{\\f1\\u8450\\f0}/g ; # DOUBLE-STRUCK CAPITAL C (that's not right...) code for a capitulum () $tofiddle =~ s/\ð/\\u240 /g; # code for an eth () $tofiddle =~ s/\°/\\u176 /g ; # interpolation "bubble", naively using degree character $tofiddle =~ s/\ŋ/\\u331 /g ; # LATIN SMALL LETTER ENG Welsh .ng. (looks like combined .nj.) () $tofiddle =~ s/\Ŋ/\\u330 /g ; # LATIN CAPITAL LETTER ENG Welsh .NG. (looks like combined .NJ.) () #$tofiddle =~ s/\œ/\\u339 /g; # LATIN SMALL LIGATURE OE oe diphthong in ligature () $tofiddle =~ s/\œ/\\'9c/g ; # LATIN SMALL LIGATURE OE oe diphthong in ligature () $tofiddle =~ s/\…/\\'85/g; # HORIZONTAL ELLIPSIS $tofiddle =~ s/\Œ/\\u338 /g ; # LATIN CAPITAL LIGATURE OE diphthong in ligature () $tofiddle =~ s/\¶/\\u182 /g ; # PILCROW SIGN paragraphus sign (¶) $tofiddle =~ s/\þ/\\u254 /g ; # LATIN SMALL LETTER THORN code for a thorn () $tofiddle =~ s/\Þ/\\u222 /g ; # LATIN CAPITAL LETTER THORN code for capital thorn () $tofiddle =~ s/\ƿ/\\u447 /g ; # LATIN LETTER WYNN wynn () $tofiddle =~ s/\ȝ/\\u541 /g ; # LATIN SMALL LETTER YOGH code for yogh () $tofiddle =~ s/\Ȝ/\\u540 /g ; # LATIN CAPITAL LETTER YOGH code for capital yogh () $tofiddle =~ s/\!/\\u33 /g ; # EXCLAMATION MARK exclamation point (!) $tofiddle =~ s/\⁁/{\\f1\\u8257\\f0}/g ; # CARET INSERTION POINT caret symbol preceding interlineation $tofiddle =~ s/\£/\\u163 /g ; # POUND SIGN pound sign (£) $tofiddle =~ s/\⁏/{\\f1\\u8271\\f0}/g ; # REVERSED SEMICOLON inverted semicolon () $tofiddle =~ s/\·/\\u183 /g ; # MIDDLE DOT raised period, like a semi-colon without the comma part $tofiddle =~ s/\§/\\u167 /g ; # SECTION SIGN section symbol () $tofiddle =~ s/\ç/\\u231 /g ; # LATIN SMALL LETTER C WITH CEDILLA $tofiddle =~ s/\â/\\u226 / ; # LATIN SMALL LETTER A WITH CIRCUMFLEX $tofiddle =~ s/\[PICA\]/\\sa240 /g; # PICASPACE $tofiddle =~ s/\[EMSP\]/\\sa120 /g; # EMSPACE $tofiddle =~ s/\[TAB\]/\\tab /g; # TAB, flush-right # KEEPN & KEEP $tofiddle =~ s/\n\[KEEPN\]/\\keep\\keepn\n/g; $tofiddle =~ s/\[\/KEEPN\]//g; $tofiddle =~ s/\\line\n*\\par/\\par/g; return $tofiddle; } # WORKING NOTES/DRAFTS # START FORMATTEDTEXTFIDDLING if ($elephants) { $formattedtext =~ s/(\\line\n) */$1/g; $formattedtext =~ s/{\\pard\\sa220\\li0\\ri0\\ql\\plain\n\n\\par}//g; # more end of line lopping $formattedtext =~ s/\\line\n*\n\\par/\\par/g; # catch untranslated entities, turn into unicode $formattedtext =~ s/\⌈/{\\f1\\u8968\\f0}/g ; # LEFT CEILING beginning of interlineation above the line $formattedtext =~ s/\⌉/{\\f1\\u8969\\f0}/g ; # RIGHT CEILING end of interlineation above the line $formattedtext =~ s/\æ/\\u230 /g; # LATIN SMALL LETTER AE $formattedtext =~ s/\Æ/\\u198 /g ; # LATIN CAPITAL LETTER AE $formattedtext =~ s/\⌊/{\\f1\\u8970\\f0}/g ; # LEFT FLOOR beginning of interlineation below the line () $formattedtext =~ s/\⌋/{\\f1\\u8971\\f0}/g ; # RIGHT FLOOR end of interlineation below the line () $formattedtext =~ s/\ℂ/{\\f1\\u8450\\f0}/g ; # DOUBLE-STRUCK CAPITAL C (that's not right...) code for a capitulum () $formattedtext =~ s/\ð/\\u240 /g; # code for an eth () $formattedtext =~ s/\°/\\u176 /g ; # interpolation "bubble", naively using degree character $formattedtext =~ s/\ŋ/\\u331 /g ; # LATIN SMALL LETTER ENG Welsh .ng. (looks like combined .nj.) () $formattedtext =~ s/\Ŋ/\\u330 /g ; # LATIN CAPITAL LETTER ENG Welsh .NG. (looks like combined .NJ.) () $formattedtext =~ s/\œ/\\u339 /g ; # LATIN SMALL LIGATURE OE oe diphthong in ligature () $formattedtext =~ s/\Œ/\\u338 /g ; # LATIN CAPITAL LIGATURE OE diphthong in ligature () $formattedtext =~ s/\¶/\\u182 /g ; # PILCROW SIGN paragraphus sign (¶) $formattedtext =~ s/\þ/\\u254 /g ; # LATIN SMALL LETTER THORN code for a thorn () $formattedtext =~ s/\Þ/\\u222 /g ; # LATIN CAPITAL LETTER THORN code for capital thorn () $formattedtext =~ s/\ƿ/\\u447 /g ; # LATIN LETTER WYNN wynn () $formattedtext =~ s/\ȝ/\\u541 /g ; # LATIN SMALL LETTER YOGH code for yogh () $formattedtext =~ s/\Ȝ/\\u540 /g ; # LATIN CAPITAL LETTER YOGH code for capital yogh () $formattedtext =~ s/\!/\\u33 /g ; # EXCLAMATION MARK exclamation point (!) $formattedtext =~ s/\⁁/{\\f1\\u8257\\f0}/g ; # CARET INSERTION POINT caret symbol preceding interlineation $formattedtext =~ s/\£/\\u163 /g ; # POUND SIGN pound sign (£) $formattedtext =~ s/\⁏/{\\f1\\u8271\\f0}/g ; # REVERSED SEMICOLON inverted semicolon () $formattedtext =~ s/\·/\\u183 /g ; # MIDDLE DOT raised period, like a semi-colon without the comma part $formattedtext =~ s/\§/\\u167 /g ; # SECTION SIGN section symbol () $formattedtext =~ s/\ç/\\u231 /g ; # LATIN SMALL LETTER C WITH CEDILLA $formattedtext =~ s/\â/\\u226 / ; # LATIN SMALL LETTER A WITH CIRCUMFLEX $formattedtext =~ s/\[PICA\]/\\sa240 /g; # PICASPACE $formattedtext =~ s/\[EMSP\]/\\sa120 /g; # EMSPACE $formattedtext =~ s/\[TAB\]/\\tab /g; # TAB, flush-right # KEEPN & KEEP $formattedtext =~ s/\n\[KEEPN\]/\\keep\\keepn\n/g; $formattedtext =~ s/\[\/KEEPN\]//g; $formattedtext =~ s/\\line\n*\\par/\\par/g; # sub new templatehead $formattedtext =~ s/^.*?fs22\n/$templatehead/s; } # END FORMATTEDTEXTFIDDLING # START TEXT FIDDLING if ($elephants) { # left marginales $text =~ s/@l\\(.*?)@l \\/$1 []/; # right marginales $text =~ s/@r\\(.*?)@r \\/$1 []/; # hanging left thingy... $text =~ s/

    (.*?)<\/p>/

    $1<\/p>/g; # headings # this is what we were doing... #$text =~ s/(.*?)<\/head>/$1/g; #$text =~ s/(.*?)<\/head>/$1/g; # now, we're deleting the entire heading, and constructing our own if ($printedsource) { $printedsourcething = "$printedsource
    "; } else { $printedsourcething = ""; } $text =~ s//

    [KEEPN]$printedsourcething\n$docdate<\/b>
    $description<\/B><\/I> $repository
    $foliation<\/p>[\/KEEPN]/s; # line breaks $text =~ s//
    /g; # formatting $text =~ s/<\/*note[^>]*>//g; $text =~ s/--/\–/g; $text =~ s/(.*?)<\/hi><\/hi>/$1<\/b><\/i>/g; $text =~ s/(.*?)<\/hi><\/hi>/##SCAPS#$1#SCAPS##<\/i>/g; $text =~ s/(.*?)<\/emph>/$1<\/i>/g; $text =~ s/(.*?)<\/hi>/$1<\/b>/g; $text =~ s/(.*?)<\/hi>/$1<\/i>/g; $text =~ s/(.*?)<\/hi>/##SCAPS#$1#SCAPS##/g; $text =~ s/(.*?)<\/hi>/$1<\/sup>/g; $text =~ s/\@\[/[TAB]\@\[/g; #$text =~ s/ /[PICA]@p; /g; $text =~ s/ /[PICA]/g; $text =~ s/(.*?)<\/hi>/

    $1<\/center>/g; $text =~ s/(
    \n)*
    \n\n*<\/p>/<\/p>/g; } # END TEXTFIDDLING # copyright 2009 Records of Early English Drama