Why does my Perl script produce corrupted output with large files on Windows?
I am new to Perl and am having a very weird print issue.
The Perl program runs on Windows XP. It first executes a SQL then loops through the results and outputs to 5 files via 5 sub routines. The 5 files are to be loaded up to a database, so it uses |
as the delimiter. Each sub routine will have something like the following.
print outfile $array[field1] . '|' . $array[field2] . '|' . $array[field3] . "\n";
The weird thing is sometimes the program outputs OK. Sometimes, the output is corrupted, e.g. line feed is missing after some point, or the values from array are not correct.
I am wondering if it is something to do with memory. The output file sizes ranges from 500MB to 9GB. The program does read the output from SQL one record at a time and write one record at a time too.
Here is the complete Perl script.
#!/usr/bin/perl
use DBI;
use DBD::Oracle;
# Constants:
use constant field0 => 0;
use constant field1 => 1;
use constant field2 => 2;
use constant field3 => 3;
use constant field4 => 4;
use constant field5 => 5;
use constant field6 => 6;
use constant field7 => 7;
use constant field8 => 8;
use constant field9 => 9;
use constant field10 => 10;
use constant field11 => 11;
use constant field12 => 12;
use constant field13 => 13;
use constant field14 => 14;
use constant field15 => 15;
use constant field16 => 16;
use constant field17 => 17;
use constant field18 => 18;
use constant field19 => 19;
use constant field20 => 20;
use constant field21 => 21;
use constant field22 => 22;
use constant field23 => 23;
use constant field24 => 24;
use constant field25 => 25;
use constant field26 => 26;
use constant field27 => 27;
use constant field28 => 28;
use constant field29 => 29;
use constant field30 => 30;
use constant field31 => 31;
use constant field32 => 32;
use constant field33 => 33;
use constant field34 => 34;
use constant field35 => 35;
use constant field36 => 36;
use constant field37 => 37;
use constant field38 => 38;
use constant field39 => 39;
use constant field40 => 40;
use constant field41 => 41;
# Capture Directory Path from Environment Variable:
my $DIRECTORY = $ENV{DATADIR};
# Process Counters:
my %fileCntr = (
ccr1 => 0,
ccr2 => 0,
ccr3 => 0,
ccr4 => 0,
ccr5 => 0
);
# Process Control Hashes:
my %xref = ();
# Process Control Variables:
my $diag = 0;
my $proc = 0;
my $ndcc = 0;
my $previous = "";
# Claims Extract array:
my @arr = ();
my $hdr = "";
# Accept/Parse DSS Connection String:
$ENV{PSWD} =~ /(.+)\/(.+)\@(.+)/;
my $USER = $1;
my $PASS = $2;
my $CONN = 'DBI:Oracle:' . $3;
# ALTER Date format:
my $ATL = qq(ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD');
# Database Connection:
my $dbh = DBI->connect( $CONN, $USER, $PASS, { RaiseError => 1, AutoCommit => 0 } );
$dbh->do($ATL); # Execute ALTER session.
my $SQL = qq(
SELECT ... here is a big sql query
);
# Open OUTPUT file for CCR processing:
open OUT1, ">$DIRECTORY/ccr1.dat" or die "Unable to open OUT1 file: $!\n";
open OUT2, ">$DIRECTORY/ccr2.dat" or die "Unable to open OUT2 file: $!\n";
open OUT3, ">$DIRECTORY/ccr3.dat" or die "Unable to open OUT3 file: $!\n";
open OUT4, ">$DIRECTORY/ccr4.dat" or die "Unable to open OUT4 file: $!\n";
open OUT5, ">$DIRECTORY/ccr5.dat" or die "Unable to open OUT5 file: $!\n";
# Redirect STDOUT to log file:
open STDOUT, ">$DIRECTORY/ccr.log" or die "Unable to open LOG file: $!\n";
# Prepare $SQL for execution:
my $sth = $dbh->prepare($SQL);
$sth->execute();
# Produce out files:
{
local $, = "|";
local $\ = "\n";
while (@arr = $sth->fetchrow_array)
{
# Direct Write of CCR1&2 records:
&BuildCCR12();
# Write and Wipe CCR3 HASH Table:
&WriteCCR3() unless ($arr[field0] == $previous);
&BuildCCR3();
# Loop processing for CCR4:
&BuildCCR4();
# Loop processing for CCR5:
&BuildCCR5();
}
}
# Print Record Counts for OUTPUT files:
foreach my $key (keys %fileCntr) { print "$key: " . $fileCntr{$key} . "\n"; }
# Terminate DB connection:
$sth->finish();
$开发者_StackOverflow社区dbh->disconnect();
# Close all output files:
close(OUT1); close(OUT2); close(OUT3);
close(OUT4); close(OUT5);
{
# Reassign Output End-of-record across subroutine block:
local $\ = "\n";
sub BuildCCR12
{
# Write CCR1 Table:
print OUT1 $arr[field6] . '|' . $arr[field7] . '|' . $arr[field5] . '|' .
$arr[field0] . '|' . $arr[field8] . '|' . $arr[field9] . '|' .
$arr[field10] . '|' . $arr[field11] . '|' . $arr[field12] . '|' .
$arr[field13] . '|' . $arr[field2] . '|' . $arr[field3] . '|' .
$arr[field40] . '|' . $arr[field16];
$fileCntr{ccr1}++;
# Write CCR2 Table:
unless ($arr[field17] eq '###########') {
print OUT2 ++$ndcc . "|" . $arr[field0] . "|" .
$arr[field6] . '|' . $arr[field7] . '|' .
$arr[field17] . '|' . $arr[field19] . '|' . $arr[field18] . '|' .
$arr[field2] . '|' . $arr[field3] . '|' . $arr[field39];
$fileCntr{ccr2}++;
}
}
sub WriteCCR3
{
unless ($previous == "")
{
# Produce ccr3 from DISTINCT combo listing:
foreach $key (keys %xref) { print OUT3 $xref{$key}; $fileCntr{ccr3}++; }
%xref = ();
}
}
sub BuildCCR3
{
# Spin off relationship:
for (my $i = field8; $i <= field13; $i++)
{
unless ($arr[$i] == -1)
{
$xref{$arr[field0] . "|" . $arr[$i]} = $arr[field0] . "|" . $arr[$i];
}
}
$previous = $arr[field0];
}
sub BuildCCR4
{
# Spin off relationship:
for (my $i = field26; $i <= field37; $i++)
{
my $sak = $arr[field0] . $arr[field6] . $arr[field7] . $arr[$i];
unless (($arr[$i] eq '#######') or ($arr[$i] eq '######')) {
print OUT4 ++$diag . '|' . $arr[field0] . '|' .
$arr[field6] . '|' .
$arr[field7] . '|' . $arr[$i];
$fileCntr{ccr4}++;
}
}
}
sub BuildCCR5
{
# Spin off field0/Procedure relationship:
for (my $i = field20; $i <= field23; $i++)
{
my $sak = $arr[field0] . $arr[field6] . $arr[field7] . $arr[$i];
unless ($arr[$i] eq '######' or $arr[$i] eq '####') {
print OUT5 ++$proc . '|' . $arr[field0] . '|' . $arr[field6] . '|' .
$arr[field7] . '|' . $arr[$i];
$fileCntr{ccr5}++;
}
}
}
}
The issue is with CCR3 output. After some point, the line feed disappears for some reason, and data got corrupted as if the line feed ate some of the output. Starting that point, it becomes 1 continuous line.
3260183|147845
3260183|78246
3260183|13898
3260183|184783
3260183|116315
3260183|184483262216|105843262217|1461703262217|175593262217|1360303262217
Another thing is this program will run close to 26 hours and while looping through the sql, is there any chance, the data can get messed up ? But it still won't explain why suddenly line feed does not work any more.
I tried to reduce clutter. First, the constants you define create a lot of clutter instead of helping with readability. If you had something like
use constant LICENSE_NO => 42;
I would understand, but if the constants are just going to correspond to integer array indices, then I don't see the point.
I also put all printing in a separate subroutine and added error checking to print
and close
statements.
I do not claim any of this is a solution to your problem, but this is where I would begin to actually debug. There might be some typos here, so watch out.
#!/usr/bin/perl
use warnings; use strict;
use DBI;
use File::Spec::Functions qw( catfile );
my @proc = qw(ccr1 ccr2 ccr3 ccr4 ccr5);
# Capture Directory Path from Environment Variable:
my $DIRECTORY = $ENV{DATADIR};
# Process Counters:
my %fileCntr = map { $_ => 0 } @proc;
# Process Control Hashes:
my %xref = ();
# Process Control Variables:
my $diag = 0;
my $proc = 0;
my $ndcc = 0;
my $previous = "";
# Claims Extract array:
my @arr = ();
my $hdr = "";
# Accept/Parse DSS Connection String:
my ($USER, $PASS, $CONN) = ($ENV{PSWD} =~ m{^(.+)/(.+)\@(.+)});
# ALTER Date format:
my $ATL = qq(ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD');
# Database Connection:
my $dbh = DBI->connect(
"DBI::Oracle:$CONN", $USER, $PASS,
{ RaiseError => 1, AutoCommit => 0 },
);
$dbh->do($ATL); # Execute ALTER session.
my $SQL = qq(
SELECT ... here is a big sql query
);
my %outh;
for my $proc ( @proc ) {
my $fn = catfile $DIRECTORY, "$proc.dat";
open $outh{ $proc }, '>', $fn
or die "Cannot open '$fn' for writing: $!";
}
# Redirect STDOUT to log file:
open STDOUT, '>', catfile($DIRECTORY, 'ccr.log')
or die "Unable to open LOG file: $!";
# Prepare $SQL for execution:
my $sth = $dbh->prepare($SQL);
$sth->execute();
# Produce out files:
while (my @arr = $sth->fetchrow_array) {
# Direct Write of CCR1&2 records:
BuildCCR12(\@arr);
# Write and Wipe CCR3 HASH Table:
WriteCCR3(\@arr) unless ($arr[0] == $previous);
BuildCCR3(\@arr);
# Loop processing for CCR4:
BuildCCR4(\@arr);
# Loop processing for CCR5:
BuildCCR5(\@arr);
}
# Print Record Counts for OUTPUT files:
foreach my $key (keys %fileCntr) {
printf "%s: %s\n", $key, $fileCntr{$key};
}
# Terminate DB connection:
$sth->finish();
$dbh->disconnect();
for my $proc (keys %outh) {
close $outh{ $proc } or die "Cannot close filehandle for '$proc': $!";
}
sub print_to {
my ($dest, $data) = @_;
my $fh = $outh{$dest};
print $fh join('|', @$data), "\n"
or die "Error writing to '$dest' file: $!";
$fileCntr{$dest}++;
return;
}
sub BuildCCR12 {
my ($arr) = @_;
print_to(ccr1 =>
[@{$arr}[6, 7, 5, 0, 8, 9, 10, 13, 2, 3, 40, 16]]);
if ($arr->[17] ne '###########') {
print_to(ccr2 =>
[++$ndcc, @{ $arr }[0, 6, 7, 17, 19, 18, 2, 3, 39]]);
}
return;
}
sub WriteCCR3 {
my ($arr) = @_;
unless ($previous) {
# Produce ccr3 from DISTINCT combo listing:
print_to(ccr3 => [ keys %xref ]);
%xref = ();
}
return;
}
sub BuildCCR3 {
my ($arr) = @_;
# Spin off relationship:
for my $i (8 .. 13) {
unless ($arr->[$i] == -1) {
my $k = join '|', @{ $arr }[0, $i];
$xref{ $k } = $k;
}
}
$previous = $arr->[0];
return;
}
sub BuildCCR4 {
my ($arr) = @_;
# Spin off relationship:
for my $i (26 .. 37) {
my $sak = join '|', @{ $arr }[0, 6, 7, $i];
my $v = $arr->[$i];
unless ( $v =~ /^#{6,7}\z/ ) {
print_to(ccr4 => [++$diag, @{ $arr }[0, 6, 7, $v]]);
}
}
return;
}
sub BuildCCR5 {
my ($arr) = @_;
# Spin off field0/Procedure relationship:
for my $i (20 .. 23) {
my $v = $arr[$i];
my $sak = join('', @{ $arr }[0, 6, 7], $v);
unless ($v eq '######' or $v eq '####') {
print_to(ccr5 => [++$proc, @{ $arr }[0, 6, 7], $v]);
}
}
return;
}
精彩评论