forked from KorfLab/Centromere_repeat_paper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: ftp_trace_reads.pl
executable file
·335 lines (266 loc) · 10.2 KB
/
ftp_trace_reads.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/usr/bin/perl
#
# ftp_trace_reads.pl
#
# A script to download trace reads and ancillary information of selected species from NCBI trace archive
#
# Last updated by: $Author$
# Last updated on: $Date$
use strict;
use warnings;
use Getopt::Long;
use Net::FTP;
use Cwd;
use List::Util 'shuffle';
###############################
#
# Set some environment variables
#
################################
# Install INT_handler() (defined at the bottom of this file) so Ctrl+C prints a summary before exiting
$SIG{'INT'} = 'INT_handler'; # capture signal interrupts
$ENV{FTP_PASSIVE} = 1; # Need this or else FTP won't work in (the default) active mode!
###############################
#
# Command-line options
#
################################
my $max_files;        # maximum number of files to download for any species
my $debug;            # whether to turn on debugging in FTP module
my $timeout;          # set timeout value for Net::FTP module
my $sleep;            # how long to sleep for before retrying ftp download
my $max_attempts;     # how many attempts to download one file before giving up
my $species_list;     # optionally specify a file which contains species (quicker than looking up via separate script)
my $prog;             # specify path to a program that will produce list of (eukaryotic) species
my $ignore_processed; # check to see what files have previously been processed and ignore those even if gzip files are present
my $ls;               # just list what you will be fetching without actually fetching it
my $one_file;         # just grab files with the specified number
my $one_species;      # just grab files from the specified species
my $help;             # display help

my $usage = "
usage: ftp_trace_reads.pl <options>
  -species_list <file> : supply a file containing species names to grab
  -ls : just list files for each species but do not fetch them
  -one_species <species> : name a single species to get only data for that species
  -one_file <xxx> : specify a 3 digit number to grab only that file for selected species
  -max_files <int> : how many files to try to download? Default = 5
  -ignore_processed : skip any files that are listed in a file called 'trace_archive_processed_files.txt'
  -prog <script> : specify a program that will return a list of species to get (e.g. find_eukaryotes_in_trace_archive)
  -max_attempts <int> : how many times should FTP try to grab a file before giving up
  -timeout <int> : timeout value in seconds for Net::FTP module
  -sleep <int> : how many seconds to sleep before retrying a fetch of files
  -debug : turn on debugging output in FTP module
  -help : this help
\n";

# BUG FIX: register -help (it was listed in usage but never parsed, so
# 'ftp_trace_reads.pl -help' produced a GetOptions error instead of the help
# text) and check GetOptions' return value so unknown switches abort cleanly.
GetOptions ("max_files:i"      => \$max_files,
            "debug"            => \$debug,
            "timeout:i"        => \$timeout,
            "sleep:i"          => \$sleep,
            "max_attempts:i"   => \$max_attempts,
            "species_list:s"   => \$species_list,
            "prog=s"           => \$prog,
            "ignore_processed" => \$ignore_processed,
            "ls"               => \$ls,
            "one_file=s"       => \$one_file,
            "one_species=s"    => \$one_species,
            "help"             => \$help) or die "$usage";

# a quick couple of sanity checks on command line options
# (check -help first: previously this test came after the mandatory-option
# check, so it could never be reached)
die "$usage" if ($help);
die "$usage" if (!$species_list && !$one_species && !$ls);
die "Use either -one_species <species name> or -species_list <file of species names>, but not both\n" if ($species_list && $one_species);
die "-one_file option must specify a 3 digit number (use leading zeroes if necessary)\n" if ($one_file && ($one_file !~ m/^[0-9]{3}$/));

###############################
#
# Set some default values
#
################################
$max_files    = 5   if (!$max_files);
$timeout      = 180 if (!$timeout);
$sleep        = 10  if (!$sleep);
$max_attempts = 5   if (!$max_attempts);
# Net::FTP expects a 0/1 flag for Debug
if (!$debug){$debug = 0}
else        {$debug = 1}
#############################################################
#
# Check for previously processed files
#
#############################################################
# this will be the file that the downstream script (parse_ftp_tracedb_data.pl script) will read
my $processed_file_name = "trace_archive_processed_files.txt";
# if -ignore option is being used we need to first get a list of previously processed files
my %previously_processed;
if ($ignore_processed){
	# 3-arg open with a lexical filehandle (the old 2-arg bareword form is
	# unsafe: a filename starting with '>' or '|' would change the open mode)
	open(my $in, '<', $processed_file_name) or die "Can't find $processed_file_name file\n";
	while (my $line = <$in>){
		chomp($line);
		# first whitespace-separated field on each line is the file name
		my ($file) = split(/\s+/, $line);
		$previously_processed{$file} = 1;
	}
	close($in);
}
##############################################################
#
# Read in or generate a list of species to process
#
#############################################################
my @taxa = get_species_names();
########################
# BASIC FTP settings
########################
my $host = "ftp.ncbi.nlm.nih.gov";
my $user = "anonymous"; # anonymous FTP login
my $password = "krbradnam\@ucdavis.edu"; # by FTP convention, an email address serves as the anonymous password
my $root = "pub/TraceDB"; # top-level directory of the NCBI trace archive on the FTP site
# keep track of how many species did not have an exact file name match on FTP site
my $missing_counter = 0;
my $species_counter = 0;
my $ftp; # Net::FTP connection object; file-level because get_files() also uses it
my $current_dir = getcwd; # starting directory; one subdirectory per species is created beneath it
##############################################################
#
# Main loop: ftp files for each species
#
#############################################################
SPECIES: foreach my $species (@taxa){
	$species_counter++;

	# make a directory for species if necessary; use Perl's builtin mkdir
	# rather than an unchecked system("mkdir ...") shell-out (safer with
	# odd characters in names, and errors are caught)
	my $path = "$current_dir/$species";
	unless (-d $path){
		mkdir($path) or die "Can't create directory $path: $!\n";
	}
	chdir($path) or die "Can't chdir to $path: $!\n";

	print STDERR "Processing files for $species\n";

	# BUG FIX: if the constructor fails $ftp is undef, so the old
	# 'die ...,$ftp->message' would itself crash; report $@ only
	$ftp = Net::FTP->new($host, Debug => $debug, Timeout => $timeout) or die "Cannot connect to $host: $@\n";
	$ftp->login($user,$password) or die "Cannot login ", $ftp->message, "\n";
	$ftp->binary;

	my $dir = "$root/$species";

	# if we just want to list files, can stop here
	if ($ls){
		my @files = $ftp->ls("$dir/fasta.$species.[0-9]*.gz");
		foreach my $file (@files){
			# strip leading path so only the file name is printed
			$file =~ s/.*fasta/fasta/;
			print "$file\n";
		}
		next SPECIES;
	}

	# need to find out how many files are in directory, grab all FASTA files
	# also keep count of how many species we don't get an exact name match for
	my @fasta;
	unless (@fasta = $ftp->ls("$dir/fasta.$species.[0-9]*.gz")){
		print STDERR "MISSING SPECIES: $species\n";
		$missing_counter++;
		next SPECIES;
	}

	# extract the 3-digit index from each file name; shuffle so repeated
	# runs sample different files
	my @file_indices;
	foreach my $file (shuffle(@fasta)){
		my ($index) = $file =~ m/.*(\d{3})\.gz/;
		push(@file_indices,$index);
	}

	# if -one_file option is being used, can replace array with that one value
	@file_indices = ($one_file) if ($one_file);

	my $file_counter = 0;
	FILE: foreach my $index (@file_indices){
		$file_counter++;
		# break out of loop if we have exceeded max number of files
		if ($file_counter > $max_files){
			print STDERR "Maximum number of files ($max_files) has been exceeded, skipping to next species\n";
			last FILE;
		}
		# grab files in pairs, fasta + anc file (they should pair up)
		my ($fasta_return) = get_files($dir,$species,$index,"fasta",1);
		print STDERR "get_files (FASTA) failed for $species $index\n" if (!$fasta_return);
		my ($anc_return) = get_files($dir,$species,$index,"anc",1);
		# BUG FIX: this message previously said 'CLIP' (copy-paste error)
		print STDERR "get_files (ANC) failed for $species $index\n" if (!$anc_return);
	}

	# tidy up
	$ftp->quit or die "Can't quit FTP",$ftp->message;
	print STDERR "\n";
}
print STDERR "\n$missing_counter species (out of $species_counter) could not be found on FTP site, might be due to slight variations in species names\n\n" if ($missing_counter);

exit;
##############################################################
#
#
# T H E S U B R O U T I N E S
#
#
#############################################################
sub get_files{
	# Download one gzipped file ($type.$species.$index.gz) from $dir on the
	# FTP site into the current directory, retrying up to $max_attempts times
	# on timeouts. Returns 1 on success (or when the file can be skipped,
	# including when it is absent from the FTP site), 0 on failure.
	my ($dir,$species,$index,$type,$attempt) = @_;

	# format file name
	my $file = "$type.$species.$index.gz";

	# now check to see whether this file has been processed before (if -ignore_processed option is in use)
	if ($ignore_processed && $previously_processed{$file}){
		print STDERR "$file has been processed before, skipping to next file\n";
		return(1);
	}

	# now need to check that files for species actually exist on FTP site
	# (size is also needed below, so only ask the server once)
	my $size = $ftp->size("$dir/$file");
	if (defined $size){
		# is file in local directory AND same size? Then nothing to do.
		if (-e $file && (-s $file == $size)){
			print STDERR "$file exists locally - skipping\n";
			return(1);
		}
		# is file in local directory but different size? Probably a
		# partial download from an earlier run, so refetch it.
		elsif (-e $file && (-s $file != $size)){
			print STDERR "FILE INCOMPLETE: refetching $file number $index, attempt number $attempt\n";
		}
		# if we get here must be a new file to download
		else{
			print STDERR "fetching $file number $index, attempt number $attempt\n";
		}

		# attempt to get file and use eval statement to catch any timeouts
		eval{$ftp->get("$dir/$file") or die "Can't grab $file\n",$ftp->message};
		if ($@ =~ /Timeout/){
			print STDERR "$@\n";
			return(0) if ($attempt > $max_attempts);
			print STDERR "Sleeping for $sleep seconds, and then retrying\n";
			sleep($sleep);
			# BUG FIX: propagate the retry's result; previously the
			# recursive call's return value was discarded and the sub
			# always fell through to return 1 even after giving up
			return get_files($dir,$species,$index,$type,++$attempt);
		}
		elsif ($@){
			# BUG FIX: a non-timeout fetch failure used to fall through
			# and return 1 (claiming success); report it as a failure
			print STDERR "$@\n";
			return(0);
		}
	}
	# or give up
	else{
		print STDERR "MISSING FILE: $file not present on FTP site\n";
		# return 1 so a file that simply doesn't exist on the server is
		# not reported by the caller as a fetch failure
		return(1);
	}

	# if we get here, then we should have downloaded a file successfully
	return(1);
}
sub get_species_names{
	# Return the list of species names to process. Three sources, in order
	# of preference: the file named by -species_list (one species per line),
	# the single name given by -one_species, or the stdout of the external
	# program named by -prog (one species per line).
	my @taxa;
	if ($species_list or $one_species){
		if ($species_list){
			print STDERR "Fetching list of species from $species_list\n\n";
			# 3-arg open with a lexical filehandle (safer than 2-arg bareword)
			open(my $in, '<', $species_list) or die "Can't find file specified by -species_file: $species_list\n";
			while (my $species = <$in>){
				# skip blank-ish lines
				next unless $species =~ m/\w+/;
				# get species name in correct format (should already be lower case)
				chomp($species);
				$species = lc($species);
				$species =~ s/ /_/g;
				push(@taxa,$species);
			}
			close($in);
		}
		elsif ($one_species){
			@taxa = ($one_species);
		}
	}
	else{
		# BUG FIX: fail with a clear message if -prog was never supplied,
		# instead of running an empty/undef backtick command
		die "No species source: use -species_list, -one_species, or -prog\n" if (!$prog);
		print STDERR "Fetching list of eukaryotes by using $prog\n\n";
		@taxa = `$prog` or die "Can't run $prog\n";
		# BUG FIX: strip trailing newlines from the program's output,
		# otherwise the species names would corrupt the FTP paths and
		# directory names built from them
		chomp(@taxa);
	}
	return(@taxa);
}
# signal event handler in case of interrupts (Ctrl+C)
sub INT_handler {
	# Print a timestamped interruption notice plus the missing-species
	# summary, then exit cleanly.
	chomp(my $date = `date`);
	print "\n\nSCRIPT INTERRUPTED at $date\n";
	print "\n$missing_counter species (out of $species_counter) could not be found on FTP site, might be due to slight variations in species names\n\n";
	exit(0);
}