From: gmc Date: Wed, 6 Apr 2011 22:06:42 +0000 (+0000) Subject: install command-line MARC import tools in @prefix@/bin X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=489cd05e2738d5cfc7bb5524428c8c3314883c5d;p=evergreen%2Fbjwebb.git install command-line MARC import tools in @prefix@/bin (corrected version after dealing with apparent git stash breakage) No longer need to keep the source tree around to use marc2are.pl, marc2sre.pl, marc2bre.pl, and parallel_pg_loader.pl. Signed-off-by: Galen Charlton git-svn-id: svn://svn.open-ils.org/ILS/trunk@20010 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- diff --git a/Open-ILS/src/Makefile.am b/Open-ILS/src/Makefile.am index 7988b53eb..e87bbd142 100644 --- a/Open-ILS/src/Makefile.am +++ b/Open-ILS/src/Makefile.am @@ -138,7 +138,7 @@ if BUILDEGJAVA OILSJAVA_DIR = java endif -bin_SCRIPTS = $(core_scripts) $(reporter_scripts) $(installautojs) @srcdir@/extras/eg_config @srcdir@/extras/fast-extract +bin_SCRIPTS = $(core_scripts) $(reporter_scripts) $(installautojs) @srcdir@/extras/eg_config @srcdir@/extras/fast-extract @srcdir@/extras/import/marc2are.pl @srcdir@/extras/import/marc2bre.pl @srcdir@/extras/import/marc2sre.pl @srcdir@/extras/import/parallel_pg_loader.pl data_DATA = $(core_data) $(reporter_data) # Take care of which subdirectories to build, and which extra files to include in a distribution. diff --git a/Open-ILS/src/extras/import/marc2are.pl b/Open-ILS/src/extras/import/marc2are.pl deleted file mode 100755 index 1eb86d43c..000000000 --- a/Open-ILS/src/extras/import/marc2are.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use lib '/openils/lib/perl5/'; - -use OpenSRF::System; -use OpenSRF::Application; -use OpenSRF::EX qw/:try/; -use OpenSRF::AppSession; -use OpenSRF::MultiSession; -use OpenSRF::Utils::SettingsClient; -use OpenILS::Application::AppUtils; -use OpenILS::Utils::Fieldmapper; -use Digest::MD5 qw/md5_hex/; -use OpenSRF::Utils::JSON; -use Data::Dumper; -use Unicode::Normalize; - -use Time::HiRes qw/time/; -use Getopt::Long; -use MARC::Batch; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -use MARC::Charset; - -MARC::Charset->ignore_errors(1); - -my ($count, $user, $password, $config, $marctype, $keyfile, @files, $quiet) = - (1, 'admin', 'open-ils', '/openils/conf/opensrf_core.xml', 'USMARC'); - -GetOptions( - 'startid=i' => \$count, - 'user=s' => \$user, - 'marctype=s' => \$marctype, - 'password=s' => \$password, - 'config=s' => \$config, - 'file=s' => \@files, - 'quiet' => \$quiet, -); - -@files = @ARGV if (!@files); - -my @ses; -my @req; -my %processing_cache; - -OpenSRF::System->bootstrap_client( config_file => $config ); -Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); - -$user = OpenILS::Application::AppUtils->check_user_session( login($user,$password) )->id; - -select STDERR; $| = 1; -select STDOUT; $| = 1; - -my $batch = new MARC::Batch ( $marctype, @files ); -$batch->strict_off(); -$batch->warnings_off(); - -my $starttime = time; -my $rec; -while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { - next if ($rec == -1); - my $id = $count; - - (my $xml = $rec->as_xml_record()) =~ s/\n//sog; - $xml =~ s/^<\?xml.+\?\s*>//go; - $xml =~ s/>\s+entityize($xml); - $xml =~ s/[\x00-\x1f]//go; - - my $bib = new Fieldmapper::authority::record_entry; - $bib->id($id); - $bib->active('t'); - $bib->deleted('f'); - $bib->marc($xml); - $bib->creator($user); - $bib->create_date('now'); - $bib->editor($user); - $bib->edit_date('now'); - 
$bib->last_xact_id('IMPORT-'.$starttime); - - print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; - - $count++; - - if (!$quiet && !($count % 20)) { - print STDERR "\r$count\t". $count / (time - $starttime); - } -} - -sub login { - my( $username, $password, $type ) = @_; - - $type |= "staff"; - - my $seed = OpenILS::Application::AppUtils->simplereq( - 'open-ils.auth', - 'open-ils.auth.authenticate.init', - $username - ); - - die("No auth seed. Couldn't talk to the auth server") unless $seed; - - my $response = OpenILS::Application::AppUtils->simplereq( - 'open-ils.auth', - 'open-ils.auth.authenticate.complete', - { username => $username, - password => md5_hex($seed . md5_hex($password)), - type => $type }); - - die("No auth response returned on login.") unless $response; - - my $authtime = $response->{payload}->{authtime}; - my $authtoken = $response->{payload}->{authtoken}; - - die("Login failed for user $username!") unless $authtoken; - - return $authtoken; -} - diff --git a/Open-ILS/src/extras/import/marc2are.pl.in b/Open-ILS/src/extras/import/marc2are.pl.in new file mode 100755 index 000000000..d6a4c12c0 --- /dev/null +++ b/Open-ILS/src/extras/import/marc2are.pl.in @@ -0,0 +1,119 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use OpenSRF::System; +use OpenSRF::Application; +use OpenSRF::EX qw/:try/; +use OpenSRF::AppSession; +use OpenSRF::MultiSession; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Application::AppUtils; +use OpenILS::Utils::Fieldmapper; +use Digest::MD5 qw/md5_hex/; +use OpenSRF::Utils::JSON; +use Data::Dumper; +use Unicode::Normalize; + +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; + +MARC::Charset->ignore_errors(1); + +my ($count, $user, $password, $config, $marctype, $keyfile, @files, $quiet) = + (1, 'admin', 'open-ils', '@sysconfdir@/opensrf_core.xml', 'USMARC'); + +GetOptions( + 'startid=i' => \$count, + 'user=s' => \$user, + 'marctype=s' => \$marctype, + 'password=s' => \$password, + 'config=s' => \$config, + 'file=s' => \@files, + 'quiet' => \$quiet, +); + +@files = @ARGV if (!@files); + +my @ses; +my @req; +my %processing_cache; + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +$user = OpenILS::Application::AppUtils->check_user_session( login($user,$password) )->id; + +select STDERR; $| = 1; +select STDOUT; $| = 1; + +my $batch = new MARC::Batch ( $marctype, @files ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = time; +my $rec; +while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + my $id = $count; + + (my $xml = $rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + my $bib = new Fieldmapper::authority::record_entry; + $bib->id($id); + $bib->active('t'); + $bib->deleted('f'); + $bib->marc($xml); + $bib->creator($user); + $bib->create_date('now'); + $bib->editor($user); + $bib->edit_date('now'); + $bib->last_xact_id('IMPORT-'.$starttime); + + print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; + + $count++; + + if (!$quiet && !($count % 20)) { + print STDERR "\r$count\t". 
$count / (time - $starttime); + } +} + +sub login { + my( $username, $password, $type ) = @_; + + $type |= "staff"; + + my $seed = OpenILS::Application::AppUtils->simplereq( + 'open-ils.auth', + 'open-ils.auth.authenticate.init', + $username + ); + + die("No auth seed. Couldn't talk to the auth server") unless $seed; + + my $response = OpenILS::Application::AppUtils->simplereq( + 'open-ils.auth', + 'open-ils.auth.authenticate.complete', + { username => $username, + password => md5_hex($seed . md5_hex($password)), + type => $type }); + + die("No auth response returned on login.") unless $response; + + my $authtime = $response->{payload}->{authtime}; + my $authtoken = $response->{payload}->{authtoken}; + + die("Login failed for user $username!") unless $authtoken; + + return $authtoken; +} + diff --git a/Open-ILS/src/extras/import/marc2bre.pl b/Open-ILS/src/extras/import/marc2bre.pl deleted file mode 100755 index e5c9604dc..000000000 --- a/Open-ILS/src/extras/import/marc2bre.pl +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -#use lib '/openils/lib/perl5/'; - -use Error qw/:try/; -use OpenILS::Utils::Fieldmapper; -use Digest::MD5 qw/md5_hex/; -use OpenSRF::Utils::JSON; -use OpenILS::Application::AppUtils; -use Data::Dumper; -use Unicode::Normalize; -use Encode; - -use FileHandle; -use Time::HiRes qw/time/; -use Getopt::Long; -use MARC::Batch; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -use MARC::Charset; -use DBI; - -#MARC::Charset->ignore_errors(1); - -my ($id_field, $id_subfield, $recid, $user, $config, $idlfile, $marctype, $tcn_offset, $tcn_mapfile, $tcn_dumpfile, $used_id_file, $used_tcn_file, $enc, @files, @trash_fields, @req_fields, $use901, $quiet, $tcn_field, $tcn_subfield) = - ('', 'a', 0, 1, '/openils/conf/opensrf_core.xml', '/openils/conf/fm_IDL.xml', 'USMARC', 0); - -my ($db_driver, $db_host, $db_port, $db_name, $db_user, $db_pw) = - ('Pg', 'localhost', 5432, 'evergreen', 'postgres', 'postgres'); - -GetOptions( - 'marctype=s' => \$marctype, # format of MARC files being processed defaults to USMARC, often set to XML - 'startid=i' => \$recid, # id number to start with when auto-assigning id numbers, defaults to highest id in database + 1 - 'idfield=s' => \$id_field, # field containing the record's desired internal id, NOT tcn - 'idsubfield=s' => \$id_subfield, # subfield of above record id field - 'tcnfield=s' => \$tcn_field, # field containing the record's desired tcn, NOT the internal id - 'tcnsubfield=s' => \$tcn_subfield, # subfield of above record tcn field - 'tcnoffset=i' => \$tcn_offset, # optionally skip characters at beginning of supplied tcn (e.g. to remove '(Sirsi)') - 'user=s' => \$user, # set creator/editor values for records in database - 'encoding=s' => \$enc, # set assumed MARC encoding for MARC::Charset - 'keyfile=s' => \$tcn_mapfile, # DEPRECATED, use tcn_mapfile instead - 'tcn_mapfile=s' => \$tcn_mapfile, # external file which allows for matching specific record tcns to specific record ids, format = one id_number|tcn_number combo per line - 'tcnfile=s' => \$tcn_dumpfile, # DEPRECATED, use tcn_dumpfile instead - 'tcn_dumpfile=s' => \$tcn_dumpfile, # allows specification of a dumpfile for all used tcn values - 'config=s' => \$config, # location of OpenSRF core config file, defaults to /openils/conf/opensrf_core.xml - 'file=s' => \@files, # files to process (or you can simple list the files as unnamed arguments, i.e. 
@ARGV) - 'required_fields=s' => \@req_fields, # skip any records missing these fields - 'trash=s' => \@trash_fields, # fields to remove from all processed records - 'xml_idl=s' => \$idlfile, # location of XML IDL file, defaults to /openils/conf/fm_IDL.xml - 'dontuse=s' => \$used_id_file, # DEPRECATED, use used_id_file instead - 'used_id_file=s' => \$used_id_file, # external file which prevents id collisions by specifying ids already in use in the database, format = one id number per line - 'used_tcn_file=s' => \$used_tcn_file, # external file which prevents tcn collisions by specifying tcns already in use in the database, format = one tcn number per line - "db_driver=s" => \$db_driver, # database driver type, usually 'Pg' - "db_host=s" => \$db_host, # database hostname - "db_port=i" => \$db_port, # database port - "db_name=s" => \$db_name, # database name - "db_user=s" => \$db_user, # database username - "db_pw=s" => \$db_pw, # database password - 'use901' => \$use901, # use values from previously created 901 fields and skip all other processing - 'quiet' => \$quiet # do not output progress count -); - -@trash_fields = split(/,/,join(',',@trash_fields)); -@req_fields = split(/,/,join(',',@req_fields)); - -if ($enc) { - MARC::Charset->ignore_errors(1); - MARC::Charset->assume_encoding($enc); -} - -if (uc($marctype) eq 'XML') { - 'open'->use(':utf8'); -} else { - bytes->use(); -} - -@files = @ARGV if (!@files); - -my @ses; -my @req; -my %processing_cache; - -my $dsn = "dbi:$db_driver:host=$db_host;port=$db_port;dbname=$db_name"; - -if (!$recid) { - my $table = 'biblio_record_entry'; - $table = 'biblio.record_entry' if ($db_driver eq 'Pg'); - - my $dbh = DBI->connect($dsn,$db_user,$db_pw); - my $sth = $dbh->prepare("SELECT MAX(id) + 1 FROM $table"); - - $sth->execute; - $sth->bind_col(1, \$recid); - $sth->fetch; - $sth->finish; - $dbh->disconnect; - - # In a clean Evergreen schema, the maximum ID will be -1; but sequences - # have to start at 1, so handle the clean Evergreen schema situation - if ($recid == 0) { - $recid = 1; - } -} - -my %tcn_source_map = ( - a => 'Sirsi_Auto', - o => 'OCLC', - i => 'ISxN', - l => 'LCCN', - s => 'System', - g => 'Gutenberg', - z => 'Unknown', -); - -Fieldmapper->import(IDL => $idlfile); - -my %tcn_map; -if ($tcn_mapfile) { - open F, $tcn_mapfile or die "Couldn't open key file $tcn_mapfile"; - while () { - if ( /^(\d+)\|(\S+)/o ) { - $tcn_map{$1} = $2; - } - } - close(F); -} - -my %used_recids; -if ($used_id_file) { - open F, $used_id_file or die "Couldn't open used-id file $used_id_file"; - while () { - chomp; - s/^\s*//; - s/\s*$//; - $used_recids{$_} = 1; - } - close(F); -} - -my %used_tcns; -if ($used_tcn_file) { - open F, $used_tcn_file or die "Couldn't open used-tcn file $used_tcn_file"; - while () { - chomp; - s/^\s*//; - s/\s*$//; - $used_tcns{$_} = 1; - } - close(F); -} - -select STDERR; $| = 1; -select STDOUT; $| = 1; - -my $batch = new MARC::Batch ( $marctype, @files ); -$batch->strict_off(); -$batch->warnings_off(); - -my $starttime = time; -my $rec; -my $count = 0; -PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { - next if ($rec == -1); - - $count++; - - # Skip records that don't contain a required field (like '245', for example) - foreach my $req_field (@req_fields) { - if (!$rec->field("$req_field")) { - warn "\n!!! 
Record $count missing required field $req_field, skipping record.\n"; - next PROCESS; - } - } - - my $id; - my $tcn_value = ''; - my $tcn_source = ''; - # If $use901 is set, use it for the id, the tcn, and the tcn source without ANY further processing (i.e. no error checking) - if ($use901) { - $rec->delete_field($_) for ($rec->field(@trash_fields)); - $tcn_value = $rec->subfield('901' => 'a'); - $tcn_source = $rec->subfield('901' => 'b'); - $id = $rec->subfield('901' => 'c'); - } else { - # This section of code deals with the record's 'id', which is a system-level, numeric, internal identifier - # It is often convenient but not necessary to carry over the internal ids from your previous ILS, so here is where that happens - if ($id_field) { - my $field = $rec->field($id_field); - if ($field) { - if ($field->is_control_field) { - $id = $field->data; - } else { - $id = $field->subfield($id_subfield); - } - # ensure internal record ids are numeric only - $id =~ s/\D+//gso if $id; - } - - # catch problem ids - if (!$id) { - warn "\n!!! Record $count has missing or invalid id field $id_field, assigning new id.\n"; - $id = ''; - } elsif (exists $used_recids{$id}) { - warn "\n!!! Record $count has a duplicate id in field $id_field, assigning new id.\n"; - $id = ''; - } else { - $used_recids{$id} = 1; - } - } - - # id field not specified or found to be invalid, assign auto id - if (!$id) { - while (exists $used_recids{$recid}) { - $recid++; - } - $used_recids{$recid} = 1; - $id = $recid; - $recid++; - } - - # This section of code deals with the record's 'tcn', or title control number, which is a record-level, possibly alpha-numeric, sometimes user-supplied value - if ($tcn_field) { - if ($tcn_mapfile) { - if (my $tcn = $tcn_map{$id}) { - $rec->delete_field( $_ ) for ($rec->field($tcn_field)); - $rec->append_fields( MARC::Field->new( $tcn_field, '', '', $tcn_subfield, $tcn ) ); - } else { - warn "\n!!! ID $id not found in tcn_mapfile, skipping record.\n"; - $count++; - next; - } - } - - my $field = $rec->field($tcn_field); - if ($field) { - if ($field->is_control_field) { - $tcn_value = $field->data; - } else { - $tcn_value = $field->subfield($tcn_subfield); - } - # $tcn_offset is another Sirsi influence, as it will allow you to remove '(Sirsi)' - # from exported tcns, but was added more generically to perhaps support other use cases - if ($tcn_value) { - $tcn_value = substr($tcn_value, $tcn_offset); - } else { - $tcn_value = ''; - } - } - } - - # turn our id and tcn into a 901 field, and also create a tcn and/or figure out the tcn source - ($tcn_value, $tcn_source) = preprocess($rec, $tcn_value, $id); - # delete the old identifier and trash fields - $rec->delete_field($_) for ($rec->field('901', $tcn_field, $id_field, @trash_fields)); - } - - (my $xml = $rec->as_xml_record()) =~ s/\n//sog; - $xml =~ s/^<\?xml.+\?\s*>//go; - $xml =~ s/>\s+entityize($xml); - $xml =~ s/[\x00-\x1f]//go; - - my $bib = new Fieldmapper::biblio::record_entry; - $bib->id($id); - $bib->active('t'); - $bib->deleted('f'); - $bib->marc($xml); - $bib->creator($user); - $bib->create_date('now'); - $bib->editor($user); - $bib->edit_date('now'); - $bib->tcn_source($tcn_source); - $bib->tcn_value($tcn_value); - $bib->last_xact_id('IMPORT-'.$starttime); - - print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; - $used_tcns{$tcn_value} = 1; - - if (!$quiet && !($count % 50)) { - print STDERR "\r$count\t". 
$count / (time - $starttime); - } -} - -if ($tcn_dumpfile) { - open TCN_DUMPFILE, '>', $tcn_dumpfile; - print TCN_DUMPFILE "$_\n" for (keys %used_tcns); -} - - -sub preprocess { - my $rec = shift; - my $tcn_value = shift; - my $id = shift; - - my $tcn_source = ''; - # in the following code, $tcn_number represents the portion of the tcn following the source code-letter - my $tcn_number = ''; - my $warn = 0; - my $passed_tcn = ''; - - # this preprocess subroutine is optimized for Sirsi-created tcns, that is, those with a single letter - # followed by some digits (and maybe 'x' in older systems). If using user supplied tcns, try to identify - # the source here, otherwise set to 'z' ('Unknown') - if ($tcn_value =~ /([a-z])([0-9xX]+)/) { - $tcn_source = $1; - $tcn_number = $2; - } else { - $tcn_source = 'z'; - } - - # save and warn if a passed in TCN is replaced - if ($tcn_value && exists $used_tcns{$tcn_value}) { - $passed_tcn = $tcn_value; - $tcn_value = ''; - $tcn_number = ''; - $tcn_source = ''; - $warn = 1; - } - - # we didn't have a user supplied tcn, or it was a duplicate, so let's derive one from commonly unique record fields - if (!$tcn_value) { - my $f = $rec->field('001'); - $tcn_value = despace($f->data) if ($f); - } - - if (!$tcn_value || exists $used_tcns{$tcn_value}) { - my $f = $rec->field('000'); - if ($f) { - $tcn_number = despace($f->data); - $tcn_source = 'g'; # only Project Gutenberg seems to use this - $tcn_value = $tcn_source.$tcn_number; - } - } - - if (!$tcn_value || exists $used_tcns{$tcn_value}) { - my $f = $rec->field('020'); - if ($f) { - $tcn_number = despace($f->subfield('a')); - $tcn_source = 'i'; - $tcn_value = $tcn_source.$tcn_number; - } - } - - if (!$tcn_value || exists $used_tcns{$tcn_value}) { - my $f = $rec->field('022'); - if ($f) { - $tcn_number = despace($f->subfield('a')); - $tcn_source = 'i'; - $tcn_value = $tcn_source.$tcn_number; - } - } - - if (!$tcn_value || exists $used_tcns{$tcn_value}) { - my $f = $rec->field('010'); - if ($f) { - $tcn_number = despace($f->subfield('a')); - $tcn_source = 'l'; - $tcn_value = $tcn_source.$tcn_number; - } - } - - # special case to catch possibly passed in full OCLC numbers and those derived from the 001 field - if ($tcn_value =~ /^oc(m|n)(\d+)$/o) { - $tcn_source = 'o'; - $tcn_number = $2; - $tcn_value = $tcn_source.$tcn_number; - } - - if (!$tcn_value || exists $used_tcns{$tcn_value}) { - $tcn_source = 's'; - $tcn_number = $id; - $tcn_value = $tcn_source.$tcn_number; - $warn = 1 - } - - - # expand $tcn_source from code letter to full name - $tcn_source = do { $tcn_source_map{$tcn_source} || 'Unknown' }; - - if ($warn) { - warn "\n!!! 
TCN $passed_tcn is already in use, using TCN ($tcn_value) derived from $tcn_source ID.\n"; - } - - return ($tcn_value, $tcn_source); -} - -sub despace { - my $value = shift; - - # remove all leading/trailing spaces and trucate at first internal space if present - $value =~ s/\s*$//o; - $value =~ s/^\s*//o; - $value =~ s/^(\S+).*$/$1/o; - - return $value; -} diff --git a/Open-ILS/src/extras/import/marc2bre.pl.in b/Open-ILS/src/extras/import/marc2bre.pl.in new file mode 100755 index 000000000..d9de5c3f4 --- /dev/null +++ b/Open-ILS/src/extras/import/marc2bre.pl.in @@ -0,0 +1,396 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use Error qw/:try/; +use OpenILS::Utils::Fieldmapper; +use Digest::MD5 qw/md5_hex/; +use OpenSRF::Utils::JSON; +use OpenILS::Application::AppUtils; +use Data::Dumper; +use Unicode::Normalize; +use Encode; + +use FileHandle; +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; +use DBI; + +#MARC::Charset->ignore_errors(1); + +my ($id_field, $id_subfield, $recid, $user, $config, $idlfile, $marctype, $tcn_offset, $tcn_mapfile, $tcn_dumpfile, $used_id_file, $used_tcn_file, $enc, @files, @trash_fields, @req_fields, $use901, $quiet, $tcn_field, $tcn_subfield) = + ('', 'a', 0, 1, '@sysconfdir@/opensrf_core.xml', '@sysconfdir@/fm_IDL.xml', 'USMARC', 0); + +my ($db_driver, $db_host, $db_port, $db_name, $db_user, $db_pw) = + ('Pg', 'localhost', 5432, 'evergreen', 'postgres', 'postgres'); + +GetOptions( + 'marctype=s' => \$marctype, # format of MARC files being processed defaults to USMARC, often set to XML + 'startid=i' => \$recid, # id number to start with when auto-assigning id numbers, defaults to highest id in database + 1 + 'idfield=s' => \$id_field, # field containing the record's desired internal id, NOT tcn + 'idsubfield=s' => \$id_subfield, # subfield of above record id field + 'tcnfield=s' => \$tcn_field, # field containing the record's desired tcn, NOT the internal id + 'tcnsubfield=s' => \$tcn_subfield, # subfield of above record tcn field + 'tcnoffset=i' => \$tcn_offset, # optionally skip characters at beginning of supplied tcn (e.g. to remove '(Sirsi)') + 'user=s' => \$user, # set creator/editor values for records in database + 'encoding=s' => \$enc, # set assumed MARC encoding for MARC::Charset + 'keyfile=s' => \$tcn_mapfile, # DEPRECATED, use tcn_mapfile instead + 'tcn_mapfile=s' => \$tcn_mapfile, # external file which allows for matching specific record tcns to specific record ids, format = one id_number|tcn_number combo per line + 'tcnfile=s' => \$tcn_dumpfile, # DEPRECATED, use tcn_dumpfile instead + 'tcn_dumpfile=s' => \$tcn_dumpfile, # allows specification of a dumpfile for all used tcn values + 'config=s' => \$config, # location of OpenSRF core config file, defaults to @sysconfdir@/opensrf_core.xml + 'file=s' => \@files, # files to process (or you can simple list the files as unnamed arguments, i.e. 
@ARGV) + 'required_fields=s' => \@req_fields, # skip any records missing these fields + 'trash=s' => \@trash_fields, # fields to remove from all processed records + 'xml_idl=s' => \$idlfile, # location of XML IDL file, defaults to @sysconfdir@/fm_IDL.xml + 'dontuse=s' => \$used_id_file, # DEPRECATED, use used_id_file instead + 'used_id_file=s' => \$used_id_file, # external file which prevents id collisions by specifying ids already in use in the database, format = one id number per line + 'used_tcn_file=s' => \$used_tcn_file, # external file which prevents tcn collisions by specifying tcns already in use in the database, format = one tcn number per line + "db_driver=s" => \$db_driver, # database driver type, usually 'Pg' + "db_host=s" => \$db_host, # database hostname + "db_port=i" => \$db_port, # database port + "db_name=s" => \$db_name, # database name + "db_user=s" => \$db_user, # database username + "db_pw=s" => \$db_pw, # database password + 'use901' => \$use901, # use values from previously created 901 fields and skip all other processing + 'quiet' => \$quiet # do not output progress count +); + +@trash_fields = split(/,/,join(',',@trash_fields)); +@req_fields = split(/,/,join(',',@req_fields)); + +if ($enc) { + MARC::Charset->ignore_errors(1); + MARC::Charset->assume_encoding($enc); +} + +if (uc($marctype) eq 'XML') { + 'open'->use(':utf8'); +} else { + bytes->use(); +} + +@files = @ARGV if (!@files); + +my @ses; +my @req; +my %processing_cache; + +my $dsn = "dbi:$db_driver:host=$db_host;port=$db_port;dbname=$db_name"; + +if (!$recid) { + my $table = 'biblio_record_entry'; + $table = 'biblio.record_entry' if ($db_driver eq 'Pg'); + + my $dbh = DBI->connect($dsn,$db_user,$db_pw); + my $sth = $dbh->prepare("SELECT MAX(id) + 1 FROM $table"); + + $sth->execute; + $sth->bind_col(1, \$recid); + $sth->fetch; + $sth->finish; + $dbh->disconnect; + + # In a clean Evergreen schema, the maximum ID will be -1; but sequences + # have to start at 1, so handle the clean Evergreen schema situation + if ($recid == 0) { + $recid = 1; + } +} + +my %tcn_source_map = ( + a => 'Sirsi_Auto', + o => 'OCLC', + i => 'ISxN', + l => 'LCCN', + s => 'System', + g => 'Gutenberg', + z => 'Unknown', +); + +Fieldmapper->import(IDL => $idlfile); + +my %tcn_map; +if ($tcn_mapfile) { + open F, $tcn_mapfile or die "Couldn't open key file $tcn_mapfile"; + while () { + if ( /^(\d+)\|(\S+)/o ) { + $tcn_map{$1} = $2; + } + } + close(F); +} + +my %used_recids; +if ($used_id_file) { + open F, $used_id_file or die "Couldn't open used-id file $used_id_file"; + while () { + chomp; + s/^\s*//; + s/\s*$//; + $used_recids{$_} = 1; + } + close(F); +} + +my %used_tcns; +if ($used_tcn_file) { + open F, $used_tcn_file or die "Couldn't open used-tcn file $used_tcn_file"; + while () { + chomp; + s/^\s*//; + s/\s*$//; + $used_tcns{$_} = 1; + } + close(F); +} + +select STDERR; $| = 1; +select STDOUT; $| = 1; + +my $batch = new MARC::Batch ( $marctype, @files ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = time; +my $rec; +my $count = 0; +PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + + $count++; + + # Skip records that don't contain a required field (like '245', for example) + foreach my $req_field (@req_fields) { + if (!$rec->field("$req_field")) { + warn "\n!!! 
Record $count missing required field $req_field, skipping record.\n"; + next PROCESS; + } + } + + my $id; + my $tcn_value = ''; + my $tcn_source = ''; + # If $use901 is set, use it for the id, the tcn, and the tcn source without ANY further processing (i.e. no error checking) + if ($use901) { + $rec->delete_field($_) for ($rec->field(@trash_fields)); + $tcn_value = $rec->subfield('901' => 'a'); + $tcn_source = $rec->subfield('901' => 'b'); + $id = $rec->subfield('901' => 'c'); + } else { + # This section of code deals with the record's 'id', which is a system-level, numeric, internal identifier + # It is often convenient but not necessary to carry over the internal ids from your previous ILS, so here is where that happens + if ($id_field) { + my $field = $rec->field($id_field); + if ($field) { + if ($field->is_control_field) { + $id = $field->data; + } else { + $id = $field->subfield($id_subfield); + } + # ensure internal record ids are numeric only + $id =~ s/\D+//gso if $id; + } + + # catch problem ids + if (!$id) { + warn "\n!!! Record $count has missing or invalid id field $id_field, assigning new id.\n"; + $id = ''; + } elsif (exists $used_recids{$id}) { + warn "\n!!! Record $count has a duplicate id in field $id_field, assigning new id.\n"; + $id = ''; + } else { + $used_recids{$id} = 1; + } + } + + # id field not specified or found to be invalid, assign auto id + if (!$id) { + while (exists $used_recids{$recid}) { + $recid++; + } + $used_recids{$recid} = 1; + $id = $recid; + $recid++; + } + + # This section of code deals with the record's 'tcn', or title control number, which is a record-level, possibly alpha-numeric, sometimes user-supplied value + if ($tcn_field) { + if ($tcn_mapfile) { + if (my $tcn = $tcn_map{$id}) { + $rec->delete_field( $_ ) for ($rec->field($tcn_field)); + $rec->append_fields( MARC::Field->new( $tcn_field, '', '', $tcn_subfield, $tcn ) ); + } else { + warn "\n!!! ID $id not found in tcn_mapfile, skipping record.\n"; + $count++; + next; + } + } + + my $field = $rec->field($tcn_field); + if ($field) { + if ($field->is_control_field) { + $tcn_value = $field->data; + } else { + $tcn_value = $field->subfield($tcn_subfield); + } + # $tcn_offset is another Sirsi influence, as it will allow you to remove '(Sirsi)' + # from exported tcns, but was added more generically to perhaps support other use cases + if ($tcn_value) { + $tcn_value = substr($tcn_value, $tcn_offset); + } else { + $tcn_value = ''; + } + } + } + + # turn our id and tcn into a 901 field, and also create a tcn and/or figure out the tcn source + ($tcn_value, $tcn_source) = preprocess($rec, $tcn_value, $id); + # delete the old identifier and trash fields + $rec->delete_field($_) for ($rec->field('901', $tcn_field, $id_field, @trash_fields)); + } + + (my $xml = $rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + my $bib = new Fieldmapper::biblio::record_entry; + $bib->id($id); + $bib->active('t'); + $bib->deleted('f'); + $bib->marc($xml); + $bib->creator($user); + $bib->create_date('now'); + $bib->editor($user); + $bib->edit_date('now'); + $bib->tcn_source($tcn_source); + $bib->tcn_value($tcn_value); + $bib->last_xact_id('IMPORT-'.$starttime); + + print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; + $used_tcns{$tcn_value} = 1; + + if (!$quiet && !($count % 50)) { + print STDERR "\r$count\t". 
$count / (time - $starttime); + } +} + +if ($tcn_dumpfile) { + open TCN_DUMPFILE, '>', $tcn_dumpfile; + print TCN_DUMPFILE "$_\n" for (keys %used_tcns); +} + + +sub preprocess { + my $rec = shift; + my $tcn_value = shift; + my $id = shift; + + my $tcn_source = ''; + # in the following code, $tcn_number represents the portion of the tcn following the source code-letter + my $tcn_number = ''; + my $warn = 0; + my $passed_tcn = ''; + + # this preprocess subroutine is optimized for Sirsi-created tcns, that is, those with a single letter + # followed by some digits (and maybe 'x' in older systems). If using user supplied tcns, try to identify + # the source here, otherwise set to 'z' ('Unknown') + if ($tcn_value =~ /([a-z])([0-9xX]+)/) { + $tcn_source = $1; + $tcn_number = $2; + } else { + $tcn_source = 'z'; + } + + # save and warn if a passed in TCN is replaced + if ($tcn_value && exists $used_tcns{$tcn_value}) { + $passed_tcn = $tcn_value; + $tcn_value = ''; + $tcn_number = ''; + $tcn_source = ''; + $warn = 1; + } + + # we didn't have a user supplied tcn, or it was a duplicate, so let's derive one from commonly unique record fields + if (!$tcn_value) { + my $f = $rec->field('001'); + $tcn_value = despace($f->data) if ($f); + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('000'); + if ($f) { + $tcn_number = despace($f->data); + $tcn_source = 'g'; # only Project Gutenberg seems to use this + $tcn_value = $tcn_source.$tcn_number; + } + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('020'); + if ($f) { + $tcn_number = despace($f->subfield('a')); + $tcn_source = 'i'; + $tcn_value = $tcn_source.$tcn_number; + } + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('022'); + if ($f) { + $tcn_number = despace($f->subfield('a')); + $tcn_source = 'i'; + $tcn_value = $tcn_source.$tcn_number; + } + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + my $f = $rec->field('010'); + if ($f) { + $tcn_number = despace($f->subfield('a')); + $tcn_source = 'l'; + $tcn_value = $tcn_source.$tcn_number; + } + } + + # special case to catch possibly passed in full OCLC numbers and those derived from the 001 field + if ($tcn_value =~ /^oc(m|n)(\d+)$/o) { + $tcn_source = 'o'; + $tcn_number = $2; + $tcn_value = $tcn_source.$tcn_number; + } + + if (!$tcn_value || exists $used_tcns{$tcn_value}) { + $tcn_source = 's'; + $tcn_number = $id; + $tcn_value = $tcn_source.$tcn_number; + $warn = 1 + } + + + # expand $tcn_source from code letter to full name + $tcn_source = do { $tcn_source_map{$tcn_source} || 'Unknown' }; + + if ($warn) { + warn "\n!!! 
TCN $passed_tcn is already in use, using TCN ($tcn_value) derived from $tcn_source ID.\n"; + } + + return ($tcn_value, $tcn_source); +} + +sub despace { + my $value = shift; + + # remove all leading/trailing spaces and trucate at first internal space if present + $value =~ s/\s*$//o; + $value =~ s/^\s*//o; + $value =~ s/^(\S+).*$/$1/o; + + return $value; +} diff --git a/Open-ILS/src/extras/import/marc2sre.pl b/Open-ILS/src/extras/import/marc2sre.pl deleted file mode 100755 index 2ac551609..000000000 --- a/Open-ILS/src/extras/import/marc2sre.pl +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use OpenSRF::System; -use OpenSRF::EX qw/:try/; -use OpenSRF::Utils::SettingsClient; -use OpenILS::Application::AppUtils; -use OpenILS::Event; -use OpenILS::Utils::Fieldmapper; -use OpenSRF::Utils::JSON; -use Unicode::Normalize; - -use Time::HiRes qw/time/; -use Getopt::Long; -use MARC::Batch; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -use MARC::Charset; -use Pod::Usage; - -MARC::Charset->ignore_errors(1); - -# Command line options, with applicable defaults -my ($idsubfield, $bibfield, $bibsubfield, @files, $libmap, $quiet, $help); -my $idfield = '004'; -my $count = 1; -my $user = 'admin'; -my $config = '/openils/conf/opensrf_core.xml'; -my $marctype = 'USMARC'; - -my $parse_options = GetOptions( - 'idfield=s' => \$idfield, - 'idsubfield=s' => \$idsubfield, - 'bibfield=s'=> \$bibfield, - 'bibsubfield=s'=> \$bibsubfield, - 'startid=i'=> \$count, - 'user=s' => \$user, - 'config=s' => \$config, - 'marctype=s' => \$marctype, - 'file=s' => \@files, - 'libmap=s' => \$libmap, - 'quiet' => \$quiet, - 'help' => \$help, -); - -if (!$parse_options or $help) { - pod2usage(0); -} - -@files = @ARGV if (!@files); - -my $U = 'OpenILS::Application::AppUtils'; -my @ses; -my @req; -my %processing_cache; -my $lib_id_map; -if ($libmap) { - $lib_id_map = map_libraries_to_ID($libmap); -} - -OpenSRF::System->bootstrap_client( config_file => $config ); -Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); - -my ($result, $evt) = get_user_id($user); -if ($evt || !$result->id) { - print("Could not retrieve user with user name '$user'\n"); - exit(0); -} - -$user = $result->id; - -select STDERR; $| = 1; -select STDOUT; $| = 1; - -my $batch = new MARC::Batch ( $marctype, @files ); -$batch->strict_off(); -$batch->warnings_off(); - -my $starttime = time; -my $rec; -while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { - next if ($rec == -1); - my $id = $count; - my $record_field; - if ($idsubfield) { - $record_field = $rec->field($idfield, $idsubfield); - } else { - $record_field = $rec->field($idfield); - } - - # Start by just using the counter as the record ID - my $record = $count; - - # If we have identified a location for the bib record ID, grab that value - if ($record_field) { - $record = $record_field->data; - } - - # If we didn't get a bib record ID, skip and move on to the next MFHD record - if (!$record) { - print STDERR "Could not find a bibliographic record ID link for record $count\n"; - next; - } - - # If we have been given bibfield / bibsubfield values, use those to find - # a matching bib record for $record and use _that_ as our record instead - if ($bibfield) { - my ($result, $evt) = map_id_to_bib($record); - if ($evt || !$result || !$result->record) { - print STDERR "Could not find matching bibliographic record for record $count\n"; - next; - } - $record = $result->record; - } else { - # Strip the identifier down to a usable integer 
- $record =~ s/^.*?(\d+).*?$/$1/o; - } - - (my $xml = $rec->as_xml_record()) =~ s/\n//sog; - $xml =~ s/^<\?xml.+\?\s*>//go; - $xml =~ s/>\s+entityize($xml); - $xml =~ s/[\x00-\x1f]//go; - - my $bib = new Fieldmapper::serial::record_entry; - $bib->id($id); - $bib->record($record); - $bib->active('t'); - $bib->deleted('f'); - $bib->marc($xml); - $bib->creator($user); - $bib->create_date('now'); - $bib->editor($user); - $bib->edit_date('now'); - $bib->last_xact_id('IMPORT-'.$starttime); - - if ($libmap) { - my $lib_id = get_library_id($rec); - if ($lib_id) { - $bib->owning_lib($lib_id); - } - } - - print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; - - $count++; - - if (!$quiet && !($count % 20)) { - print STDERR "\r$count\t". $count / (time - $starttime); - } -} - -# Generate a hash of library names (as found in the 852b in the MFHD record) to -# integers representing actor.org_unit ID values -sub map_libraries_to_ID { - my $map_filename = shift; - - my %lib_id_map; - - open(MAP_FH, '<', $map_filename) or die "Could not load [$map_filename] $!"; - while () { - my ($lib, $id) = $_ =~ /^(.*?)\t(.*?)$/; - $lib_id_map{$lib} = $id; - } - - return \%lib_id_map; -} - -# Look up the actor.org_unit.id value for this library name -sub get_library_id { - my $record = shift; - - my $lib_name = $record->field('852')->subfield('b'); - my $lib_id = $lib_id_map->{$lib_name}; - - return $lib_id; -} - -# Get the actor.usr.id value for the given username -sub get_user_id { - my $username = shift; - - my ($result, $evt); - - $result = $U->cstorereq( - 'open-ils.cstore.direct.actor.user.search', - { usrname => $username, deleted => 'f' } - ); - $evt = OpenILS::Event->new('ACTOR_USR_NOT_FOUND') unless $result; - - return ($result, $evt); -} - -# Get the biblio.record_entry.id value for the given identifier; note that this -# approach uses a wildcard to match anything that precedes the identifier value -sub map_id_to_bib { - my $record = shift; - - my ($result, $evt); - - my %search = ( - tag => $bibfield, - value => { ilike => '%' . $record } - ); - - if ($bibsubfield) { - $search{'subfield'} = $bibsubfield; - } - - $result = $U->cstorereq( - 'open-ils.cstore.direct.metabib.full_rec.search', \%search - ); - $evt = OpenILS::Event->new('METABIB_FULL_REC_NOT_FOUND') unless $record; - - return ($result, $evt); -} - -__END__ - -=head1 NAME - -marc2sre.pl - Convert MARC Format for Holdings Data (MFHD) records to SRE -(serial.record_entry) JSON objects - -=head1 SYNOPSIS - -C [B<--config>=I] -[[B<--idfield>=I[ B<--idsubfield>=I]] [B<--start_id>=I] -[B<--user>=I] [B<--marctype>=I] -[[B<--file>=I[, ...]] [B<--libmap>=I] [B<--quiet>=I] -[[B<--bibfield>=I [B<--bibsubfield>=]] - -=head1 DESCRIPTION - -For one or more files containing MFHD records, iterate through the records -and generate SRE (serial.record_entry) JSON objects. - -=head1 OPTIONS - -=over - -=item * B<-c> I, B<--config>=I - -Specifies the OpenSRF configuration file used to connect to the OpenSRF router. -Defaults to F - -=item * B<--idfield> I - -Specifies the MFHD field where the identifier of the corresponding -bibliographic record is found. Defaults to '004'. - -=item * B<--idsubfield> I - -Specifies the MFHD subfield, if any, where the identifier of the corresponding -bibliographic record is found. This option is ignored unless it is accompanied -by the B<--idfield> option. Defaults to null. - -=item * B<--bibfield> I - -Specifies the field in the bibliographic record that holds the identifier -value. Defaults to null. 
- -=item * B<--bibsubfield> I - -Specifies the subfield in the bibliographic record, if any, that holds the -identifier value. This option is ignored unless it is accompanied by the -B<--bibfield> option. Defaults to null. - -=item * B<-u> I, B<--user>=I - -Specifies the Evergreen user that will own these serial records. - -=item * B<-m> I, B<--marctype>=I - -Specifies whether the files containg the MFHD records are in MARC21 ('MARC21') -or MARC21XML ('XML') format. Defaults to MARC21. - -=item * B<-l> I, B<--libmap>=I - -Points to a file to containing a mapping of library names to integers. -The integer represents the actor.org_unit.id value of the library. This enables -us to generate an ingest file that does not subsequently need to manually -manipulated. - -The library name must correspond to the 'b' subfield of the 852 field. -Well, it does not have to, but you will have to modify this script -accordingly. - -The format of the map file should be the name of the library, followed -by a tab, followed by the desired numeric ID of the library. For example: - -BR1 4 -BR2 5 - -=item * B<-q>, B<--quiet> - -Suppresses the record counter output. - -=back - -=head1 EXAMPLES - - marc2sre.pl --idfield 004 --bibfield 035 --bibsubfield a --user cat1 serial_holding.xml - -Processes MFHD records in the B file. The script pulls the -bibliographic record identifier from the 004 control field of the MFHD record -and searches for a matching value in the bibliographic record in data field -035, subfield a. The "cat1" user will own the processed MFHD records. - -=head1 AUTHOR - -Dan Scott - -=head1 COPYRIGHT AND LICENSE - -Copyright 2010-2011 by Dan Scott - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- -=cut diff --git a/Open-ILS/src/extras/import/marc2sre.pl.in b/Open-ILS/src/extras/import/marc2sre.pl.in new file mode 100755 index 000000000..a783d10c6 --- /dev/null +++ b/Open-ILS/src/extras/import/marc2sre.pl.in @@ -0,0 +1,333 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use OpenSRF::System; +use OpenSRF::EX qw/:try/; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Application::AppUtils; +use OpenILS::Event; +use OpenILS::Utils::Fieldmapper; +use OpenSRF::Utils::JSON; +use Unicode::Normalize; + +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; +use Pod::Usage; + +MARC::Charset->ignore_errors(1); + +# Command line options, with applicable defaults +my ($idsubfield, $bibfield, $bibsubfield, @files, $libmap, $quiet, $help); +my $idfield = '004'; +my $count = 1; +my $user = 'admin'; +my $config = '@sysconfdir@/opensrf_core.xml'; +my $marctype = 'USMARC'; + +my $parse_options = GetOptions( + 'idfield=s' => \$idfield, + 'idsubfield=s' => \$idsubfield, + 'bibfield=s'=> \$bibfield, + 'bibsubfield=s'=> \$bibsubfield, + 'startid=i'=> \$count, + 'user=s' => \$user, + 'config=s' => \$config, + 'marctype=s' => \$marctype, + 'file=s' => \@files, + 'libmap=s' => \$libmap, + 'quiet' => \$quiet, + 'help' => \$help, +); + +if (!$parse_options or $help) { + pod2usage(0); +} + +@files = @ARGV if (!@files); + +my $U = 'OpenILS::Application::AppUtils'; +my @ses; +my @req; +my %processing_cache; +my $lib_id_map; +if ($libmap) { + $lib_id_map = map_libraries_to_ID($libmap); +} + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +my ($result, $evt) = get_user_id($user); +if ($evt || !$result->id) { + print("Could not retrieve user with user name '$user'\n"); + exit(0); +} + +$user = $result->id; + +select STDERR; $| = 1; +select STDOUT; $| = 1; + +my $batch = new MARC::Batch ( $marctype, @files ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = time; +my $rec; +while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + my $id = $count; + my $record_field; + if ($idsubfield) { + $record_field = $rec->field($idfield, $idsubfield); + } else { + $record_field = $rec->field($idfield); + } + + # Start by just using the counter as the record ID + my $record = $count; + + # If we have identified a location for the bib record ID, grab that value + if ($record_field) { + $record = $record_field->data; + } + + # If we didn't get a bib record ID, skip and move on to the next MFHD record + if (!$record) { + print STDERR "Could not find a bibliographic record ID link for record $count\n"; + next; + } + + # If we have been given bibfield / bibsubfield values, use those to find + # a matching bib record for $record and use _that_ as our record instead + if ($bibfield) { + my ($result, $evt) = map_id_to_bib($record); + if ($evt || !$result || !$result->record) { + print STDERR "Could not find matching bibliographic record for record $count\n"; + next; + } + $record = $result->record; + } else { + # Strip the identifier down to a usable integer + $record =~ s/^.*?(\d+).*?$/$1/o; + } + + (my $xml = $rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + my $bib = new Fieldmapper::serial::record_entry; + $bib->id($id); + $bib->record($record); + $bib->active('t'); + $bib->deleted('f'); + $bib->marc($xml); + 
$bib->creator($user); + $bib->create_date('now'); + $bib->editor($user); + $bib->edit_date('now'); + $bib->last_xact_id('IMPORT-'.$starttime); + + if ($libmap) { + my $lib_id = get_library_id($rec); + if ($lib_id) { + $bib->owning_lib($lib_id); + } + } + + print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; + + $count++; + + if (!$quiet && !($count % 20)) { + print STDERR "\r$count\t". $count / (time - $starttime); + } +} + +# Generate a hash of library names (as found in the 852b in the MFHD record) to +# integers representing actor.org_unit ID values +sub map_libraries_to_ID { + my $map_filename = shift; + + my %lib_id_map; + + open(MAP_FH, '<', $map_filename) or die "Could not load [$map_filename] $!"; + while () { + my ($lib, $id) = $_ =~ /^(.*?)\t(.*?)$/; + $lib_id_map{$lib} = $id; + } + + return \%lib_id_map; +} + +# Look up the actor.org_unit.id value for this library name +sub get_library_id { + my $record = shift; + + my $lib_name = $record->field('852')->subfield('b'); + my $lib_id = $lib_id_map->{$lib_name}; + + return $lib_id; +} + +# Get the actor.usr.id value for the given username +sub get_user_id { + my $username = shift; + + my ($result, $evt); + + $result = $U->cstorereq( + 'open-ils.cstore.direct.actor.user.search', + { usrname => $username, deleted => 'f' } + ); + $evt = OpenILS::Event->new('ACTOR_USR_NOT_FOUND') unless $result; + + return ($result, $evt); +} + +# Get the biblio.record_entry.id value for the given identifier; note that this +# approach uses a wildcard to match anything that precedes the identifier value +sub map_id_to_bib { + my $record = shift; + + my ($result, $evt); + + my %search = ( + tag => $bibfield, + value => { ilike => '%' . $record } + ); + + if ($bibsubfield) { + $search{'subfield'} = $bibsubfield; + } + + $result = $U->cstorereq( + 'open-ils.cstore.direct.metabib.full_rec.search', \%search + ); + $evt = OpenILS::Event->new('METABIB_FULL_REC_NOT_FOUND') unless $record; + + return ($result, $evt); +} + +__END__ + +=head1 NAME + +marc2sre.pl - Convert MARC Format for Holdings Data (MFHD) records to SRE +(serial.record_entry) JSON objects + +=head1 SYNOPSIS + +C [B<--config>=I] +[[B<--idfield>=I[ B<--idsubfield>=I]] [B<--start_id>=I] +[B<--user>=I] [B<--marctype>=I] +[[B<--file>=I[, ...]] [B<--libmap>=I] [B<--quiet>=I] +[[B<--bibfield>=I [B<--bibsubfield>=]] + +=head1 DESCRIPTION + +For one or more files containing MFHD records, iterate through the records +and generate SRE (serial.record_entry) JSON objects. + +=head1 OPTIONS + +=over + +=item * B<-c> I, B<--config>=I + +Specifies the OpenSRF configuration file used to connect to the OpenSRF router. +Defaults to F<@sysconfdir@/opensrf_core.xml> + +=item * B<--idfield> I + +Specifies the MFHD field where the identifier of the corresponding +bibliographic record is found. Defaults to '004'. + +=item * B<--idsubfield> I + +Specifies the MFHD subfield, if any, where the identifier of the corresponding +bibliographic record is found. This option is ignored unless it is accompanied +by the B<--idfield> option. Defaults to null. + +=item * B<--bibfield> I + +Specifies the field in the bibliographic record that holds the identifier +value. Defaults to null. + +=item * B<--bibsubfield> I + +Specifies the subfield in the bibliographic record, if any, that holds the +identifier value. This option is ignored unless it is accompanied by the +B<--bibfield> option. Defaults to null. + +=item * B<-u> I, B<--user>=I + +Specifies the Evergreen user that will own these serial records. 
+ +=item * B<-m> I, B<--marctype>=I + +Specifies whether the files containg the MFHD records are in MARC21 ('MARC21') +or MARC21XML ('XML') format. Defaults to MARC21. + +=item * B<-l> I, B<--libmap>=I + +Points to a file to containing a mapping of library names to integers. +The integer represents the actor.org_unit.id value of the library. This enables +us to generate an ingest file that does not subsequently need to manually +manipulated. + +The library name must correspond to the 'b' subfield of the 852 field. +Well, it does not have to, but you will have to modify this script +accordingly. + +The format of the map file should be the name of the library, followed +by a tab, followed by the desired numeric ID of the library. For example: + +BR1 4 +BR2 5 + +=item * B<-q>, B<--quiet> + +Suppresses the record counter output. + +=back + +=head1 EXAMPLES + + marc2sre.pl --idfield 004 --bibfield 035 --bibsubfield a --user cat1 serial_holding.xml + +Processes MFHD records in the B file. The script pulls the +bibliographic record identifier from the 004 control field of the MFHD record +and searches for a matching value in the bibliographic record in data field +035, subfield a. The "cat1" user will own the processed MFHD records. + +=head1 AUTHOR + +Dan Scott + +=head1 COPYRIGHT AND LICENSE + +Copyright 2010-2011 by Dan Scott + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +=cut diff --git a/Open-ILS/src/extras/import/parallel_pg_loader.pl b/Open-ILS/src/extras/import/parallel_pg_loader.pl deleted file mode 100755 index 43ddaa9ce..000000000 --- a/Open-ILS/src/extras/import/parallel_pg_loader.pl +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use lib '/openils/lib/perl5/'; - -use OpenSRF::System; -use OpenSRF::EX qw/:try/; -use OpenSRF::Utils::SettingsClient; -use OpenILS::Utils::Fieldmapper; -use OpenSRF::Utils::JSON; -use FileHandle; - -use Time::HiRes qw/time/; -use Getopt::Long; - -my @files; -my ($config, $output, @auto, @order, @wipe) = - ('/openils/conf/opensrf_core.xml', 'pg_loader-output'); -my $nocommit = 0; - -GetOptions( - 'config=s' => \$config, - 'output=s' => \$output, - 'wipe=s' => \@wipe, - 'autoprimary=s' => \@auto, - 'order=s' => \@order, - 'nocommit=i' => \$nocommit, -); - -my $pwd = `pwd`; -chop($pwd); - -my %lineset; -my %fieldcache; - -OpenSRF::System->bootstrap_client( config_file => $config ); -Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); - -my $main_out = FileHandle->new(">$output.sql") if ($output); - -binmode($main_out,'utf8'); - -$main_out->print("SET CLIENT_ENCODING TO 'UNICODE';\n\n"); -$main_out->print("BEGIN;\n\n"); - -my %out_files; -for my $h (@order) { - $out_files{$h} = FileHandle->new(">$output.$h.sql"); - binmode($out_files{$h},'utf8'); -} - -my $count = 0; -my $starttime = time; -my $after_commit = ''; -while ( my $rec = <> ) { - next unless ($rec); - - my $row; - try { - $row = OpenSRF::Utils::JSON->JSON2perl($rec); - } catch Error with { - my $e = shift; - warn "\n\n !!! Error : $e \n\n at or around line $count\n"; - }; - next unless ($row); - - my $class = $row->class_name; - my $hint = $row->json_hint; - - next unless ( grep /$hint/, @order ); - - if (!$fieldcache{$hint}) { - my @cols = $row->real_fields; - if (grep { $_ eq $hint} @auto) { - @cols = grep { $_ ne $class->Identity } @cols; - } - - $fieldcache{$hint} = - { table => $class->Table, - sequence => $class->Sequence, - pkey => $class->Identity, - fields => \@cols, - }; - - #XXX it burnnnsssessss - $fieldcache{$hint}{table} =~ s/\.full_rec/.real_full_rec/o if ($hint eq 'mfr'); - - my $fields = join(',', @{ $fieldcache{$hint}{fields} }); - $main_out->print( "DELETE FROM $fieldcache{$hint}{table};\n" ) if (grep {$_ eq $hint } @wipe); - # Speed up loading of bib records - $main_out->print( "COPY $fieldcache{$hint}{table} ($fields) FROM '$pwd/$output.$hint.sql';\n" ); - - } - - my $line = [map { $row->$_ } @{ $fieldcache{$hint}{fields} }]; - my @data; - my $x = 0; - for my $d (@$line) { - if (!defined($d)) { - $d = '\N'; - } else { - $d =~ s/\f/\\f/gos; - $d =~ s/\n/\\n/gos; - $d =~ s/\r/\\r/gos; - $d =~ s/\t/\\t/gos; - $d =~ s/\\/\\\\/gos; - } - if ($hint eq 'bre' and $fieldcache{$hint}{fields}[$x] eq 'quality') { - $d = int($d) if ($d ne '\N'); - } - push @data, $d; - $x++; - } - $out_files{$hint}->print( join("\t", @data)."\n" ); - - if (!($count % 500)) { - print STDERR "\r$count\t". 
$count / (time - $starttime); - } - - $count++; -} - -for my $hint (@order) { - next if (grep { $_ eq $hint} @auto); - next unless ($fieldcache{$hint}{sequence}); - $after_commit .= "SELECT setval('$fieldcache{$hint}{sequence}'::TEXT, (SELECT MAX($fieldcache{$hint}{pkey}) FROM $fieldcache{$hint}{table}), TRUE);\n"; -} - -if (grep /^mfr$/, %out_files) { - $main_out->print("SELECT reporter.enable_materialized_simple_record_trigger();\n"); - $main_out->print("SELECT reporter.disable_materialized_simple_record_trigger();\n"); -} - -$main_out->print("COMMIT;\n\n") unless $nocommit; -$main_out->print($after_commit); -$main_out->close; - diff --git a/Open-ILS/src/extras/import/parallel_pg_loader.pl.in b/Open-ILS/src/extras/import/parallel_pg_loader.pl.in new file mode 100755 index 000000000..f276f0b8d --- /dev/null +++ b/Open-ILS/src/extras/import/parallel_pg_loader.pl.in @@ -0,0 +1,136 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use OpenSRF::System; +use OpenSRF::EX qw/:try/; +use OpenSRF::Utils::SettingsClient; +use OpenILS::Utils::Fieldmapper; +use OpenSRF::Utils::JSON; +use FileHandle; + +use Time::HiRes qw/time/; +use Getopt::Long; + +my @files; +my ($config, $output, @auto, @order, @wipe) = + ('@sysconfdir@/opensrf_core.xml', 'pg_loader-output'); +my $nocommit = 0; + +GetOptions( + 'config=s' => \$config, + 'output=s' => \$output, + 'wipe=s' => \@wipe, + 'autoprimary=s' => \@auto, + 'order=s' => \@order, + 'nocommit=i' => \$nocommit, +); + +my $pwd = `pwd`; +chop($pwd); + +my %lineset; +my %fieldcache; + +OpenSRF::System->bootstrap_client( config_file => $config ); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +my $main_out = FileHandle->new(">$output.sql") if ($output); + +binmode($main_out,'utf8'); + +$main_out->print("SET CLIENT_ENCODING TO 'UNICODE';\n\n"); +$main_out->print("BEGIN;\n\n"); + +my %out_files; +for my $h (@order) { + $out_files{$h} = FileHandle->new(">$output.$h.sql"); + binmode($out_files{$h},'utf8'); +} + +my $count = 0; +my $starttime = time; +my $after_commit = ''; +while ( my $rec = <> ) { + next unless ($rec); + + my $row; + try { + $row = OpenSRF::Utils::JSON->JSON2perl($rec); + } catch Error with { + my $e = shift; + warn "\n\n !!! 
Error : $e \n\n at or around line $count\n"; + }; + next unless ($row); + + my $class = $row->class_name; + my $hint = $row->json_hint; + + next unless ( grep /$hint/, @order ); + + if (!$fieldcache{$hint}) { + my @cols = $row->real_fields; + if (grep { $_ eq $hint} @auto) { + @cols = grep { $_ ne $class->Identity } @cols; + } + + $fieldcache{$hint} = + { table => $class->Table, + sequence => $class->Sequence, + pkey => $class->Identity, + fields => \@cols, + }; + + #XXX it burnnnsssessss + $fieldcache{$hint}{table} =~ s/\.full_rec/.real_full_rec/o if ($hint eq 'mfr'); + + my $fields = join(',', @{ $fieldcache{$hint}{fields} }); + $main_out->print( "DELETE FROM $fieldcache{$hint}{table};\n" ) if (grep {$_ eq $hint } @wipe); + # Speed up loading of bib records + $main_out->print( "COPY $fieldcache{$hint}{table} ($fields) FROM '$pwd/$output.$hint.sql';\n" ); + + } + + my $line = [map { $row->$_ } @{ $fieldcache{$hint}{fields} }]; + my @data; + my $x = 0; + for my $d (@$line) { + if (!defined($d)) { + $d = '\N'; + } else { + $d =~ s/\f/\\f/gos; + $d =~ s/\n/\\n/gos; + $d =~ s/\r/\\r/gos; + $d =~ s/\t/\\t/gos; + $d =~ s/\\/\\\\/gos; + } + if ($hint eq 'bre' and $fieldcache{$hint}{fields}[$x] eq 'quality') { + $d = int($d) if ($d ne '\N'); + } + push @data, $d; + $x++; + } + $out_files{$hint}->print( join("\t", @data)."\n" ); + + if (!($count % 500)) { + print STDERR "\r$count\t". $count / (time - $starttime); + } + + $count++; +} + +for my $hint (@order) { + next if (grep { $_ eq $hint} @auto); + next unless ($fieldcache{$hint}{sequence}); + $after_commit .= "SELECT setval('$fieldcache{$hint}{sequence}'::TEXT, (SELECT MAX($fieldcache{$hint}{pkey}) FROM $fieldcache{$hint}{table}), TRUE);\n"; +} + +if (grep /^mfr$/, %out_files) { + $main_out->print("SELECT reporter.enable_materialized_simple_record_trigger();\n"); + $main_out->print("SELECT reporter.disable_materialized_simple_record_trigger();\n"); +} + +$main_out->print("COMMIT;\n\n") unless $nocommit; +$main_out->print($after_commit); +$main_out->close; + diff --git a/configure.ac b/configure.ac index 88a84dcee..c1f5137f4 100644 --- a/configure.ac +++ b/configure.ac @@ -373,12 +373,19 @@ AC_CONFIG_FILES([Makefile Open-ILS/updates/Makefile Open-ILS/xul/staff_client/Makefile Open-ILS/src/extras/eg_config - Open-ILS/src/extras/fast-extract + Open-ILS/src/extras/import/marc2are.pl + Open-ILS/src/extras/import/marc2bre.pl + Open-ILS/src/extras/import/marc2sre.pl + Open-ILS/src/extras/import/parallel_pg_loader.pl Open-ILS/src/perlmods/Makefile Open-ILS/src/perlmods/lib/OpenILS/Utils/Cronscript.pm], [ if test -e "./Open-ILS/src/extras/eg_config"; then chmod 755 Open-ILS/src/extras/eg_config; fi; if test -e "./Open-ILS/src/extras/fast-extract"; then chmod 755 Open-ILS/src/extras/fast-extract; fi; + if test -e "./Open-ILS/src/extras/import/marc2are.pl"; then chmod 755 Open-ILS/src/extras/import/marc2are.pl; fi; + if test -e "./Open-ILS/src/extras/import/marc2bre.pl"; then chmod 755 Open-ILS/src/extras/import/marc2bre.pl; fi; + if test -e "./Open-ILS/src/extras/import/marc2sre.pl"; then chmod 755 Open-ILS/src/extras/import/marc2sre.pl; fi; + if test -e "./Open-ILS/src/extras/import/parallel_pg_loader.pl"; then chmod 755 Open-ILS/src/extras/import/parallel_pg_loader.pl; fi; ]) AC_OUTPUT
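
---

Usage note (a rough sketch, not part of the patch): with these scripts installed in @prefix@/bin, a typical bibliographic load might look like the commands below. File names (bibs.xml, bibs.bre, stage) are hypothetical, and option values vary by site; see the GetOptions blocks in the scripts above for the full option lists.

    # convert MARC bibs to biblio.record_entry JSON objects on stdout
    marc2bre.pl --marctype XML --user 1 --file bibs.xml > bibs.bre

    # stage tab-delimited data plus a driver SQL file (stage.sql, stage.bre.sql)
    parallel_pg_loader.pl --order bre --output stage < bibs.bre

    # load the staged data; the generated COPY statements reference
    # stage.bre.sql by the directory parallel_pg_loader.pl was run from,
    # so run this where the database server can read that path
    psql -U evergreen -d evergreen -f stage.sql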