#!/usr/local/bin/perl -w
# -*- cperl -*-

=head1 NAME

getRange.pl

=head1 SYNOPSIS

 getRange.pl -h
 getRange.pl -v
 getRange.pl [-q] start-end csMetadata_tsv_file

RCS:$Id$

=head1 DESCRIPTION

Reads a tab-separated metadata file (or STDIN when no file is given),
skipping lines that start with C<#> and blank lines.  Export starts at the
record whose first column equals I<start> and stops after the record whose
first column equals I<end>.

For every exported record the paper files found under
C<$citeseerPapersDir/E<lt>column 7E<gt>/> whose name is the file-system form
of column 6 (extension stripped) followed by C<.> and anything are copied to
F</tmp/start-end/>, and one line is appended to F</tmp/start-end/list.tsv>.
Finally the directory is archived as F</tmp/start-end.zip>.

=head1 HISTORY

ORIGIN: created from templateApp.pl version 3.4 by Min-Yen Kan

RCS:$Log$

=cut

require 5.006;    # "our", lexical file handles and 3-arg open need 5.6
use strict;
use warnings;
use Getopt::Std;

### USER customizable section

my ($progname) = $0 =~ m{([^/]+)\z};
$progname = $0 unless defined $progname;

my $outputVersion = "1.0";    # not referenced anywhere in this script

# Directory under which the export directory and the zip file are created.
my $outputRoot = "/tmp";

# Citeseer sf / url conversion
my $citeseerPapersDir = "/export/bulk/citeseer/cs/papers/";
my %CSURL2FS = (
    '/'   => 'zSz', '$'   => 'zDz', '+'   => 'zPz', '*'   => 'zAz',
    '@'   => 'zTz', '%'   => 'zCz', '?'   => 'zQz', '\\'  => 'zBz',
    'zSz' => '/',   'zDz' => '$',   'zPz' => '+',   'zAz' => '*',
    'zTz' => '@',   'zCz' => '%',   'zQz' => '?',   'zBz' => '\\',
);

### END user customizable section

# Option flags filled in by Getopt::Std::getopts().
our ($opt_q, $opt_v, $opt_h);

### Ctrl-C handler
sub quitHandler {
    print STDERR "\n# $progname fatal\t\tReceived a 'SIGINT'\n# $progname - exiting cleanly\n";
    exit;
}

### HELP Sub-procedure
sub Help {
    print STDERR "usage: $progname -h\t\t\t\t[invokes help]\n";
    print STDERR " $progname -v\t\t\t\t[invokes version]\n";
    print STDERR " $progname [-q] start-end csMetadata_tsv_file\n";
    print STDERR "Options:\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
    print STDERR "\n";
    print STDERR "Will accept input on STDIN as a single file.\n";
    print STDERR "\n";
}

### VERSION Sub-procedure
# Shows this file's POD through perldoc, then exits.  The list form of
# system() keeps $0 away from the shell.
sub Version {
    if (system('perldoc', $0) != 0) {
        die "Need \"perldoc\" in PATH to print version information";
    }
    exit;
}

sub License {
    print STDERR "# Copyright 2005 \251 by Min-Yen Kan\n";
}

# url2sf(URL) -> string
# Replaces each of the characters / $ + * @ % ? in URL by its three-letter
# code from %CSURL2FS and returns the result.
# NOTE(review): %CSURL2FS also maps the backslash, but this pattern has never
# converted it -- confirm which behaviour is intended before changing it.
sub url2sf {
    my $url = shift;
    $url =~ s/([\/\$\+\*\@\%\?])/$CSURL2FS{$1}/g;
    return $url;
}

# noExtensionWildCard(NAME) -> string
# Returns NAME with one trailing paper extension (.pdf, .ps, .z and the
# .gz/.z compressed variants, case-insensitive) removed.  The dots inside the
# compound extensions are escaped so that only a literal "." matches.
sub noExtensionWildCard {
    my $name = shift;
    $name =~ s/\.(?:pdf\.gz|pdf\.z|ps\.gz|ps\.z|pdf|ps|z)$//i;
    return $name;
}

# copyPaperFiles(SRCDIR, STEM, DESTDIR) -> number of files handed to cp
# Copies every plain file in SRCDIR whose name starts with "STEM." into
# DESTDIR.  The directory is scanned here and cp is run without a shell, so
# characters in the metadata cannot be interpreted as shell syntax.
# Problems are reported on STDERR; the export carries on regardless.
sub copyPaperFiles {
    my ($srcDir, $stem, $destDir) = @_;

    my $dh;
    if (!opendir($dh, $srcDir)) {
        print STDERR "# $progname warning\t\tCan't read directory \"$srcDir\": $!\n";
        return 0;
    }
    my $prefix  = "$stem.";
    my @matches = sort
                  grep { -f $_ }
                  map  { "$srcDir/$_" }
                  grep { index($_, $prefix) == 0 } readdir $dh;
    closedir $dh;

    if (!@matches) {
        print STDERR "# $progname warning\t\tNo files match \"$srcDir/$stem\".*\n";
        return 0;
    }
    if (system('cp', @matches, $destDir) != 0) {
        print STDERR "# $progname warning\t\tcp failed for \"$srcDir/$stem\".*\n";
    }
    return scalar @matches;
}

###
### MAIN program
###
# Returns the process exit status.
sub main {
    if (!@ARGV) {    # invoked with no arguments, possible error in execution?
        Help();
        return 0;
    }

    $SIG{INT} = \&quitHandler;

    if (!getopts('hqv')) {    # unknown option: getopts already complained
        Help();
        return 1;
    }

    if (!$opt_q) { License(); }               # call License, unless quiet
    if ($opt_v)  { Version(); return 0; }     # call Version, if asked for
    if ($opt_h)  { Help();    return 0; }     # call help, if asked for

    # The range names a directory under $outputRoot, so it must be exactly
    # "start-end" with no slash, whitespace or extra dash in either part.
    my $range = shift @ARGV;
    my ($start, $end) = defined $range
        ? $range =~ m{\A ([^-/\s]+) - ([^-/\s]+) \z}x
        : ();
    if (!defined $end) {
        print STDERR "# $progname fatal\t\tExpected a range of the form start-end\n";
        Help();
        return 1;
    }
    if (!$opt_q) { print STDERR "# starting at $start, ending at $end\n"; }

    ## standardize input stream (either STDIN or first arg on command line)
    my $filename = shift @ARGV;
    my $in;
    if (defined $filename && length $filename) {
        if (!-e $filename) {
            die "# $progname crash\t\tFile \"$filename\" doesn't exist";
        }
        open($in, '<', $filename)
            or die "# $progname crash\t\tCan't open \"$filename\": $!";
    }
    else {
        $filename = "";
        $in       = \*STDIN;
    }

    my $outDir = "$outputRoot/$range";
    if (!-d $outDir) {
        mkdir($outDir, 0777)
            or die "# $progname crash\t\tCan't create \"$outDir\": $!";
    }
    my $listFile = "$outDir/list.tsv";
    open(my $list, '>', $listFile)
        or die "# $progname crash\t\tCan't write \"$listFile\": $!";

    my $exporting  = 0;    # true between the start and end records
    my $sawStart   = 0;
    my $numRecords = 0;

    LINE:
    while (my $line = <$in>) {
        next LINE if $line =~ /^\#/;       # skip comments
        next LINE if $line =~ /^\s+$/;     # skip blank lines
        chomp $line;

        my @elts = split /\t/, $line;
        my $id   = $elts[0];

        if (!$exporting && $id eq $start) {
            $exporting = 1;
            $sawStart  = 1;
            print STDERR "# found start\n";
        }

        if ($exporting) {
            my ($url, $dir) = @elts[5, 6];
            if (!defined $url || !defined $dir) {
                print STDERR "# $progname warning\t\tRecord \"$id\" has fewer than 7 fields -- skipped\n";
            }
            else {
                my $paperLoc = noExtensionWildCard(url2sf($url));
                print "cp \"$citeseerPapersDir/$dir/$paperLoc\".* $outDir/\n";
                copyPaperFiles("$citeseerPapersDir/$dir", $paperLoc, $outDir);
                # One record per line; the newline was lost to chomp above.
                print {$list} "$id\t$dir\t$url\t$paperLoc\n";
                $numRecords++;
            }
        }

        if ($exporting && $id eq $end) {
            $exporting = 0;
            print STDERR "# found end -- $numRecords exported\n";
            last LINE;
        }

        # Progress marker; only meaningful for numeric ids.
        if ($id =~ /\A\d+\z/ && $id % 100 == 0) { print STDERR "[$id]"; }
    }

    if (!$sawStart) {
        print STDERR "# $progname warning\t\tStart id \"$start\" not found\n";
    }
    elsif ($exporting) {
        print STDERR "# $progname warning\t\tEnd id \"$end\" not found -- $numRecords exported\n";
    }

    close($list) or die "# $progname crash\t\tCan't finish writing \"$listFile\": $!";
    if ($filename ne "") { close($in); }

    # create zip file (run from $outputRoot so the archive holds relative paths)
    chdir($outputRoot)
        or die "# $progname crash\t\tCan't chdir to \"$outputRoot\": $!";
    if (system('zip', '-q', '-r', "$outputRoot/$range.zip", $range) != 0) {
        print STDERR "# $progname fatal\t\tzip failed for \"$outDir\"\n";
        return 1;
    }
    # The export directory $outDir is intentionally left in place.

    return 0;
}

###
### END of main program
###

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without starting an export.
exit(main()) unless caller;

1;