#!/usr/bin/perl # papyrus - a perl script to parse REED records & chunk them into a database # some additional translation of typesetting codes into xhtml is done use Date::Calc; open LOGFILE, ">logfile"; open TESTFILE, ">testfile"; open WTF, ">wtffile"; # DBI initializations to point us at the correct datasource $driver = "mysql"; $database = "reed"; $hostname = "localhost"; $dbuser = "reed"; $dbpwd = ""; # enter password here use DBI; $dsn = "DBI:$driver:database=$database;host=$hostname"; $dbh = DBI->connect( $dsn, $dbuser, $dbpwd ); # the doccodes query $dch = $dbh->prepare( "select description, repository, location from doccodes where code like ? group by code" ); # some volume initialization # $id, $paranum, $volume, $volnum, $editor, $text, $rawtext); $volume = "Inns of Court"; $volnum = 1; $editor = "Alan H. Nelson"; # where are the footnotes? make a quick array of the files containing them #@fnfiles = ("IOCFootnotes1(start-1570).txt", "IOCFootnotes2(1570-1600).txt", "IOCFootnotes3(1600-end).txt"); @fnfiles = ( "/stmary/shared/epub/reed/workspace/test/junedata/IOCFN.txt" ); # set this to turn off/on footnotes handling $footnotesok = 1; # start these off blank $rawtext = $text = ""; # make the reed documents table $mkreed2 = "CREATE TABLE reed2 ( id int(8) NOT NULL auto_increment, paranum int(11) NOT NULL default 0, text longtext, rawtext longtext, docdate varchar(255) NOT NULL , description varchar(255) NOT NULL, repository varchar(255) NOT NULL, foliation varchar(255) NOT NULL, edsubhead longtext, sourcetype varchar(255) NOT NULL, volume varchar(255) NOT NULL , volnum tinyint(4) NOT NULL default 0, editor varchar(255) NOT NULL , doctype varchar(255) NOT NULL , section varchar(255) NOT NULL , docname varchar(255) NOT NULL , docpage varchar(255) NOT NULL , PRIMARY KEY (id) ) TYPE=MyISAM;"; ## make the footnotes table $mkfn = "create table footnotes ( id int not null, fnum varchar(5), fnote longtext, rawtext longtext, primary key (id), key (fnum) ) TYPE=MyISAM;"; # make the doccodes table; we don't usually do this, # since it should remain static, more or less. For # the record. $mkdc = "DROP TABLE IF EXISTS `reed`.`doccodes`; CREATE TABLE `reed`.`doccodes` ( `code` varchar(8) NOT NULL default '', `description` varchar(255) NOT NULL default '', `repository` varchar(255) NOT NULL default '', `loccode` varchar(12) NOT NULL default '', `location` varchar(45) NOT NULL default '', `sortorder` varchar(45) NOT NULL default '', `id` int(10) unsigned NOT NULL auto_increment, PRIMARY KEY (`id`,`sortorder`,`code`), KEY `loccode` (`loccode`) ) TYPE=MyISAM;"; # note that this clear-out is redundant, since we DROP TABLE IF EXISTS above... # clear out the past version of the reed table (and notes tables here too) # & start fresh $dbh->do("drop table reed2;"); $dbh->do("drop table footnotes;"); $dbh->do($mkreed2); $dbh->do($mkfn); # xlation hash # 10 Mar 09 # some/all of the begin & end pairs should be removed from the hash, and # treated as grouping replacements. # the hash xltn should be part of a larger translation subroutine, which handles # both the elements & the structural markup # XML TEI-Lite xlation hash %xmlxltn = ( '\.\.\.', '…', # ELLIPSIS '{', '', # italics ON '}', '', # italics OFF #'--', '–', # EN DASH -- do this afterwards, though -- comments complicate things '@a\\', '⌈' , # LEFT CEILING beginning of interlineation above the line '@a \\', '⌉' , # RIGHT CEILING end of interlineation above the line '@ ae', 'æ', # LATIN SMALL LETTER AE '@ AE', 'Æ', # LATIN CAPITAL LETTER AE '@b\\', '⌊' , # LEFT FLOOR beginning of interlineation below the line () '@b \\', '⌋' , # RIGHT FLOOR end of interlineation below the line () '@C', 'ℂ' , # DOUBLE-STRUCK CAPITAL C (that's not right...) code for a capitulum () '@d', 'ð', # code for an eth () '@e\\', '', # begin boldface '@e \\', '', # return to roman '@g\\', '', # begin greek '@g \\', '', # return to roman #'@i\\' => '°', # interpolation "bubble", naively using degree character '@i' => '°' , # interpolation "bubble", naively using degree character '@j\\', '' , # begin bold italics '@j \\', '', # return to roman '@k\\', '', # begin small caps '@k \\', '', # return to roman '@m\\', '', # begins centred text '@m \\', '', # ends centred text '@n', 'ŋ' , # LATIN SMALL LETTER ENG Welsh .ng. (looks like combined .nj.) () '@N', 'Ŋ' , # LATIN CAPITAL LETTER ENG Welsh .NG. (looks like combined .NJ.) () '@oe', 'œ' , # LATIN SMALL LIGATURE OE oe diphthong in ligature () '@OE', 'Œ' , # LATIN CAPITAL LIGATURE OE diphthong in ligature () '@P', '¶', # PILCROW SIGN paragraphus sign (¶) '@q\\', '' , # begin italic small caps '@q \\', '' , # return to roman N.B. everything within the @q pair must be typed in upper case characters or the conversion won’t work '@s\\', '' , # begins superscription '@s \\', '', # ends superscription '@th', 'þ' , # LATIN SMALL LETTER THORN code for a thorn () '@TH', 'Þ' , # LATIN CAPITAL LETTER THORN code for capital thorn () '@v', 'EFD;' , # LATIN SMALL LETTER MIDDLE-WELSH V; CAP=1EFC Welsh .special v. -- proposed UCS entity; no codepoint yet; MUFI entitiy 2.0a 1EFC, 1EFD '@y', 'ƿ', # LATIN LETTER WYNN wynn () '@z', 'ȝ' , # LATIN SMALL LETTER YOGH code for yogh () '@Z', 'Ȝ' , # LATIN CAPITAL LETTER YOGH code for capital yogh () '@!', '!' , # EXCLAMATION MARK exclamation point (!) '\^', '⁁' , # CARET INSERTION POINT caret symbol preceding interlineation '@\$', '£', # POUND SIGN pound sign (£) '@;', '⁏' , # REVERSED SEMICOLON inverted semicolon () #'@\[', '<-- FLUSHRIGHT -->', # ALIGNMENT info -- tab (used in accounts before flush-right sum) '@\*', '·' , # MIDDLE DOT raised period, like a semi-colon without the comma part '@\%', '§', # SECTION SIGN section symbol () '@u', '<-- HALFPICASPACE --> ' , # half-pica space # probably don't need to have this, since it's structural '@p', '<-- PICASPACE --> ', # pica space #'\!', '', # hard carriage return on the typesetter '@\?c', 'ç', # LATIN SMALL LETTER C WITH CEDILLA '@"', 'ñ', # LATIN SMALL LETTER N WITH CEDILLA '@\^a', 'â' # LATIN SMALL LETTER A WITH CIRCUMFLEX ); # end of xmlxltn hash ##################### #begin footnotes if ($footnotesok) { # note: @f\#\ and @i\ are fiddled in the main text; I don't think they're used in the footnotes foreach $fnfile (@fnfiles) { open( FOOTNOTES, "< $fnfile" ) or die "Couldn't open that; sorry: $!\n"; print "Opening $fnfile\n"; while () { chomp; next if /^--/; $fnid++; $fnraw = $_; # the easy translations foreach $string ( keys %xmlxltn ) { # turns out we do want doublequoting of the keys $qqfindstring = qq{$string}; # and we want to escape trailing slashes in the keys $qqfindstring =~ s/\\$/\\\\/; # replace all occurrences of the quoted, escaped string with its # value in the xmlxltn hash s/$qqfindstring/$xmlxltn{$string}/gs; } # now, let's split it into the fields... ( $junk, $fnum, $fnote ) = split( /\\/, $_, 3 ); $fnote =~ s/^ *//; #print "RAW $fnum: $fnraw\nNOTE: $fnote\n\n"; # insert into the footnotes table... $dbh->do( "INSERT INTO footnotes (id, fnum, fnote, rawtext) VALUES (?, ?, ?, ?)", undef, $fnid, $fnum, $fnote, $fnraw ); } } } #end of footnotes ################### # reset the record delimiter for the records files #$/ = "\x21\x0d\x0a\x21\x0d\x0a"; # \n\r!\n\r $/ = "\x0d\x0a\x21\x0d\x0a"; # begin records # begin while loop INPUT: while ( $inline = <> ) { # continuation... # let's render the footnotes harmless for now while ( $inline =~ s/\@f\\([^\\]*)\\/\@f<$1>/g ) { #print "FOOTNOTES! $1\n"; } # similarly, interpolation bubbles while ( $inline =~ s/\@i\\/\@i/g ) { } # if slashbang happens on a line other than @r or @m, get the next line, too if ( $inline =~ /(?; print "YEEHA2 $continued: $inline\n"; redo INPUT; } $inline =~ s/\x0d//gs; if ( $inline =~ /\@w.*\@w/ ) { $chimera = "CHIMERA: Multiple headers\n"; } else { $chimera = ""; } print TESTFILE "$i: $chimera<$inline>\nENDOFRECORD\n\n"; # check to see whether there's an @h in this record; if so, initialize the record; # handle the previous record; and carry on... # changed to @w: all records have it # if ($inline =~ /\@[w]/) { print LOGFILE "Hey, it's a new record!\n"; print LOGFILE "START:$inline:END:\n"; if ($text) { &putout; } # whatever the output routine is.. # } # carry on with processing this chunk $i++ ; # count the chunks... each chunk is probably (?) a paragraph $paranum = $i; # we count each paragraph, just in case $rawtext .= $inline; print LOGFILE "Chunk $i / $continued\n<<\n$inline\n>>\n"; # accents etc. -- combining elements -- should get a complete list from REED # in the meantime, compile a hash of unique accents, and then put them out at # the end of the run -- note which ones aren't xlated... if ( $inline =~ /\@([.^,:"\?#].)/g ) { $unxlt{$1} = "Another untranslated thingy"; print "Hey! A match: $1\n"; print LOGFILE "UNXLT: $1\n"; } # we should translate the open/close pairs first, then the bangslashes, # if there are any left at that point $inline =~ s/\\*!\n$/\n/gs; # the easy translations foreach $string ( keys %xmlxltn ) { # some fiddling here to see what the correct quoting mechanism is #print LOGFILE "$string: $xmlxltn{$string}\n"; # turns out we do want doublequoting of the keys $qqfindstring = qq{$string}; # and we want to escape trailing slashes in the keys $qqfindstring =~ s/\\$/\\\\/; # replace all occurrences of the quoted, escaped string with its # value in the xmlxltn hash $inline =~ s/$qqfindstring/$xmlxltn{$string}/gs; } # fix those html comments $inline =~ s/<--//g ) { $notetype = $1; $notenum = $2; # insert into the footnotes table... this is what we do for Frye... $dbh->do( "INSERT INTO footnotes (ref, notenumber, footnotenumber, chapter) VALUES (?, ?, ?, ?)", undef, $ref, $para, $footnotenumber, $notebook ); } # Not in IOC; used in Wales, though, I think #'@x\...@x \' -> '', # a sort of sub-sub-heading delimits the line containing names of court personnel for ecclesiastical court cases # sometimes there're comments, descriptions, notes to Gord in the files: [[ like this ]] # and sometimes there are no brackets delimiting them... manual work. # let's lop off that trailing
\n
\n $inline =~ s/
\n\n*$//g; $text .= $inline; $text .= "\n"; } # end of the while loop # final putout & some logging &putout if $text; print LOGFILE "\nTOTAL CHUNKS: $i / $continued\n"; print LOGFILE "\nUNTRANSLATED ACCENTS:\n"; foreach $accent ( keys %unxlt ) { print LOGFILE "$accent\n"; } # daterange procedure sub daterange ($$) { my ( $start, $end ) = @_; @dateout = split //, $start; @outend = split //, $end; $offset = 0 - scalar(@outend); splice( @dateout, $offset, scalar(@outend), @outend ); $outdate = join '', @dateout; return $outdate; } # the output/insertion procedure sub putout () { # wrap the appropriate markup around the critter $htmlheader = " REED "; $htmltail = " "; $xmlheader = " "; $xmltail = " "; # let's not top & tail for now #$text = "$xmlheader$text$xmltail"; $dbh->do( "INSERT INTO reed2 (id, paranum, text, rawtext, volume, volnum, editor, description, repository, docdate, foliation, edsubhead, sourcetype) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)", undef, $id, $paranum, $text, $rawtext, $volume, $volnum, $editor, $description, $repository, $docdate, $foliation, $edsubhead, $docsrctype ); #print LOGFILE "WTF?\n\nX: $id\nX: $paranum\nX: $text\nX: $rawtext\nX: $volume\nX: $volnum\nX: $editor\nX: $description\nX: $repository\nX: $docdate\n\n"; print LOGFILE "WTF?\nDES: $description\nREPO: $repository\n\n"; # now, clear things out for the next record $rawtext = $text = $repository = $description = $docdate = ""; print LOGFILE "Processed $i\n<<\n$inline\n>>\n"; } # copyright 2009 Records of Early English Drama