#!/usr/bin/perl
# papyrus - a perl script to parse REED records & chunk them into a
database
# some additional translation of typesetting codes into xhtml is done
use Date::Calc;
open LOGFILE, ">logfile";
open TESTFILE, ">testfile";
open WTF, ">wtffile";
# DBI initializations to point us at the correct datasource
$driver = "mysql";
$database = "reed";
$hostname = "localhost";
$dbuser = "reed";
$dbpwd = ""; # enter password here
use DBI;
$dsn =
"DBI:$driver:database=$database;host=$hostname";
$dbh = DBI->connect( $dsn, $dbuser, $dbpwd );
# the doccodes query
$dch = $dbh->prepare(
"select description, repository, location from doccodes where code like ?
group by code"
);
# some volume initialization
# $id, $paranum, $volume, $volnum, $editor, $text, $rawtext);
$volume = "Inns of Court";
$volnum = 1;
$editor = "Alan H. Nelson";
# where are the footnotes? make a quick array of the files containing
them
#@fnfiles = ("IOCFootnotes1(start-1570).txt",
"IOCFootnotes2(1570-1600).txt", "IOCFootnotes3(1600-end).txt");
@fnfiles = (
"/stmary/shared/epub/reed/workspace/test/junedata/IOCFN.txt"
);
# set this to turn off/on footnotes handling
$footnotesok = 1;
# start these off blank
$rawtext = $text = "";
# make the reed documents table
$mkreed2 = "CREATE TABLE reed2 (
id int(8) NOT NULL auto_increment,
paranum int(11) NOT NULL default 0,
text longtext,
rawtext longtext,
docdate varchar(255) NOT NULL ,
description varchar(255) NOT NULL,
repository varchar(255) NOT NULL,
foliation varchar(255) NOT NULL,
edsubhead longtext,
sourcetype varchar(255) NOT NULL,
volume varchar(255) NOT NULL ,
volnum tinyint(4) NOT NULL default 0,
editor varchar(255) NOT NULL ,
doctype varchar(255) NOT NULL ,
section varchar(255) NOT NULL ,
docname varchar(255) NOT NULL ,
docpage varchar(255) NOT NULL ,
PRIMARY KEY (id)
) TYPE=MyISAM;";
## make the footnotes table
$mkfn = "create table footnotes (
id int not null,
fnum varchar(5),
fnote longtext,
rawtext longtext,
primary key (id),
key (fnum)
) TYPE=MyISAM;";
# make the doccodes table; we don't usually do this,
# since it should remain static, more or less. For
# the record.
$mkdc = "DROP TABLE IF EXISTS `reed`.`doccodes`;
CREATE TABLE `reed`.`doccodes` (
`code` varchar(8) NOT NULL default '',
`description` varchar(255) NOT NULL default '',
`repository` varchar(255) NOT NULL default '',
`loccode` varchar(12) NOT NULL default '',
`location` varchar(45) NOT NULL default '',
`sortorder` varchar(45) NOT NULL default '',
`id` int(10) unsigned NOT NULL auto_increment,
PRIMARY KEY (`id`,`sortorder`,`code`),
KEY `loccode` (`loccode`)
) TYPE=MyISAM;";
# note that this clear-out is redundant, since we DROP TABLE IF EXISTS
above...
# clear out the past version of the reed table (and notes tables here too)
# & start fresh
$dbh->do("drop table reed2;");
$dbh->do("drop table footnotes;");
$dbh->do($mkreed2);
$dbh->do($mkfn);
# xlation hash
# 10 Mar 09
# some/all of the begin & end pairs should be removed from the hash, and
# treated as grouping replacements.
# the hash xltn should be part of a larger translation subroutine, which
handles
# both the elements & the structural markup
# XML TEI-Lite xlation hash
%xmlxltn = (
'\.\.\.', '…', # ELLIPSIS
'{', '', # italics ON
'}', '', # italics OFF
#'--', '–', # EN DASH -- do this afterwards, though --
comments complicate things
'@a\\', '⌈'
, # LEFT CEILING beginning of interlineation above the line
'@a \\', '⌉'
, # RIGHT CEILING end of interlineation above the line
'@ ae', 'æ', # LATIN SMALL LETTER AE
'@ AE', 'Æ', # LATIN CAPITAL LETTER AE
'@b\\', '⌊'
, # LEFT FLOOR beginning of interlineation below the line ()
'@b \\', '⌋'
, # RIGHT FLOOR end of interlineation below the line ()
'@C', 'ℂ'
, # DOUBLE-STRUCK CAPITAL C (that's not right...) code for a capitulum
()
'@d', 'ð', # code for an eth ()
'@e\\', '', # begin boldface
'@e \\', '', # return to roman
'@g\\',
'', # begin greek
'@g \\', '', # return to roman
#'@i\\' => '°', # interpolation "bubble", naively using degree
character
'@i' => '°'
, # interpolation "bubble", naively using degree character
'@j\\', ''
, # begin bold italics
'@j \\', '', # return to roman
'@k\\',
'', # begin small caps
'@k \\', '', # return to roman
'@m\\',
'', # begins centred text
'@m \\', '', # ends centred text
'@n', 'ŋ'
, # LATIN SMALL LETTER ENG Welsh .ng. (looks like combined .nj.) ()
'@N', 'Ŋ'
, # LATIN CAPITAL LETTER ENG Welsh .NG. (looks like combined .NJ.) ()
'@oe', 'œ'
, # LATIN SMALL LIGATURE OE oe diphthong in ligature ()
'@OE', 'Œ'
, # LATIN CAPITAL LIGATURE OE diphthong in ligature ()
'@P',
'¶', # PILCROW SIGN paragraphus sign (¶)
'@q\\',
''
, # begin italic small caps
'@q \\', ''
, # return to roman N.B. everything within the @q pair must be typed
in upper case characters or the conversion won’t work
'@s\\', ''
, # begins superscription
'@s \\', '', # ends superscription
'@th', 'þ'
, # LATIN SMALL LETTER THORN code for a thorn ()
'@TH', 'Þ'
, # LATIN CAPITAL LETTER THORN code for capital thorn ()
'@v', 'EFD;'
, # LATIN SMALL LETTER MIDDLE-WELSH V; CAP=1EFC Welsh .special v. --
proposed UCS entity; no codepoint yet; MUFI entitiy 2.0a 1EFC, 1EFD
'@y', 'ƿ', # LATIN LETTER WYNN wynn ()
'@z', 'ȝ'
, # LATIN SMALL LETTER YOGH code for yogh ()
'@Z', 'Ȝ'
, # LATIN CAPITAL LETTER YOGH code for capital yogh ()
'@!', '!'
, # EXCLAMATION MARK exclamation point (!)
'\^', '⁁'
, # CARET INSERTION POINT caret symbol preceding interlineation
'@\$', '£', # POUND SIGN pound sign (£)
'@;', '⁏'
, # REVERSED SEMICOLON inverted semicolon ()
#'@\[', '<-- FLUSHRIGHT -->', # ALIGNMENT info -- tab (used in accounts
before flush-right sum)
'@\*', '·'
, # MIDDLE DOT raised period, like a semi-colon without the comma part
'@\%',
'§', # SECTION SIGN section symbol ()
'@u', '<-- HALFPICASPACE --> '
, # half-pica space # probably don't need to have this, since it's
structural
'@p', '<-- PICASPACE --> ', # pica space
#'\!', '', # hard carriage return on the typesetter
'@\?c',
'ç', # LATIN SMALL LETTER C WITH CEDILLA
'@"',
'ñ', # LATIN SMALL LETTER N WITH CEDILLA
'@\^a',
'â' # LATIN SMALL LETTER A WITH CIRCUMFLEX
);
# end of xmlxltn hash
#####################
#begin footnotes
if ($footnotesok) {
# note: @f\#\ and @i\ are fiddled in the main text; I don't think they're
used in the footnotes
foreach $fnfile (@fnfiles) {
open( FOOTNOTES, "< $fnfile" )
or die
"Couldn't open that; sorry: $!\n";
print "Opening $fnfile\n";
while () {
chomp;
next if /^--/;
$fnid++;
$fnraw = $_;
# the easy translations
foreach $string ( keys %xmlxltn ) {
# turns out we do want doublequoting of the keys
$qqfindstring = qq{$string};
# and we want to escape trailing slashes in the keys
$qqfindstring =~ s/\\$/\\\\/;
# replace all occurrences of the quoted, escaped string with its
# value in the xmlxltn hash
s/$qqfindstring/$xmlxltn{$string}/gs;
}
# now, let's split it into the fields...
( $junk, $fnum, $fnote ) =
split( /\\/, $_, 3 );
$fnote =~ s/^ *//;
#print "RAW $fnum: $fnraw\nNOTE: $fnote\n\n";
# insert into the footnotes table...
$dbh->do(
"INSERT INTO footnotes (id, fnum, fnote, rawtext)
VALUES (?, ?, ?, ?)",
undef, $fnid, $fnum, $fnote,
$fnraw
);
}
}
}
#end of footnotes
###################
# reset the record delimiter for the records files
#$/ = "\x21\x0d\x0a\x21\x0d\x0a";
# \n\r!\n\r
$/ = "\x0d\x0a\x21\x0d\x0a";
# begin records
# begin while loop
INPUT: while ( $inline = <> ) {
# continuation...
# let's render the footnotes harmless for now
while (
$inline =~ s/\@f\\([^\\]*)\\/\@f<$1>/g )
{
#print "FOOTNOTES! $1\n";
}
# similarly, interpolation bubbles
while ( $inline =~ s/\@i\\/\@i/g ) {
}
# if slashbang happens on a line other than @r or @m, get the next line,
too
if ( $inline =~
/(?;
print "YEEHA2 $continued: $inline\n";
redo INPUT;
}
$inline =~ s/\x0d//gs;
if ( $inline =~ /\@w.*\@w/ ) {
$chimera = "CHIMERA: Multiple headers\n";
}
else { $chimera = ""; }
print TESTFILE
"$i: $chimera<$inline>\nENDOFRECORD\n\n";
# check to see whether there's an @h in this record; if so, initialize the
record;
# handle the previous record; and carry on...
# changed to @w: all records have it
# if ($inline =~ /\@[w]/) {
print LOGFILE "Hey, it's a new record!\n";
print LOGFILE "START:$inline:END:\n";
if ($text) {
&putout;
} # whatever the output routine is..
# }
# carry on with processing this chunk
$i++
; # count the chunks... each chunk is probably (?) a paragraph
$paranum =
$i; # we count each paragraph, just in case
$rawtext .= $inline;
print LOGFILE
"Chunk $i / $continued\n<<\n$inline\n>>\n";
# accents etc. -- combining elements -- should get a complete list from
REED
# in the meantime, compile a hash of unique accents, and then put them out
at
# the end of the run -- note which ones aren't xlated...
if ( $inline =~ /\@([.^,:"\?#].)/g ) {
$unxlt{$1} =
"Another untranslated thingy";
print "Hey! A match: $1\n";
print LOGFILE "UNXLT: $1\n";
}
# we should translate the open/close pairs first, then the bangslashes,
# if there are any left at that point
$inline =~ s/\\*!\n$/\n/gs;
# the easy translations
foreach $string ( keys %xmlxltn ) {
# some fiddling here to see what the correct quoting mechanism is
#print LOGFILE "$string: $xmlxltn{$string}\n";
# turns out we do want doublequoting of the keys
$qqfindstring = qq{$string};
# and we want to escape trailing slashes in the keys
$qqfindstring =~ s/\\$/\\\\/;
# replace all occurrences of the quoted, escaped string with its
# value in the xmlxltn hash
$inline =~
s/$qqfindstring/$xmlxltn{$string}/gs;
}
# fix those html comments
$inline =~ s/<--//g
)
{
$notetype = $1;
$notenum = $2;
# insert into the footnotes table... this is what we do for Frye...
$dbh->do(
"INSERT INTO footnotes (ref, notenumber, footnotenumber, chapter)
VALUES (?, ?, ?, ?)",
undef, $ref, $para, $footnotenumber,
$notebook
);
}
# Not in IOC; used in Wales, though, I think
#'@x\...@x \' -> '',
# a sort of sub-sub-heading delimits the line containing names of court
personnel for ecclesiastical court cases
# sometimes there're comments, descriptions, notes to Gord in the files:
[[ like this ]]
# and sometimes there are no brackets delimiting them... manual work.
# let's lop off that trailing
\n
\n
$inline =~ s/
\n\n*$//g;
$text .= $inline;
$text .= "\n";
} # end of the while loop
# final putout & some logging
&putout if $text;
print LOGFILE "\nTOTAL CHUNKS: $i / $continued\n";
print LOGFILE "\nUNTRANSLATED ACCENTS:\n";
foreach $accent ( keys %unxlt ) {
print LOGFILE "$accent\n";
}
# daterange procedure
sub daterange ($$) {
my ( $start, $end ) = @_;
@dateout = split //, $start;
@outend = split //, $end;
$offset = 0 - scalar(@outend);
splice(
@dateout, $offset,
scalar(@outend), @outend
);
$outdate = join '', @dateout;
return $outdate;
}
# the output/insertion procedure
sub putout () {
# wrap the appropriate markup around the critter
$htmlheader = "
REED
";
$htmltail = "
";
$xmlheader = "
";
$xmltail = "
";
# let's not top & tail for now
#$text = "$xmlheader$text$xmltail";
$dbh->do(
"INSERT INTO reed2 (id, paranum, text, rawtext, volume, volnum, editor,
description, repository, docdate, foliation, edsubhead, sourcetype)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
undef, $id, $paranum, $text, $rawtext,
$volume, $volnum, $editor, $description,
$repository, $docdate, $foliation,
$edsubhead, $docsrctype
);
#print LOGFILE "WTF?\n\nX: $id\nX: $paranum\nX: $text\nX: $rawtext\nX:
$volume\nX: $volnum\nX: $editor\nX: $description\nX: $repository\nX:
$docdate\n\n";
print LOGFILE
"WTF?\nDES: $description\nREPO: $repository\n\n";
# now, clear things out for the next record
$rawtext = $text = $repository =
$description = $docdate = "";
print LOGFILE
"Processed $i\n<<\n$inline\n>>\n";
}
# copyright 2009 Records of Early English Drama