#!/usr/bin/perl
#
# HOS_finder.pl
#
# A script to report on possible Higher Order Structure of tandem repeats present in Trace Archive reads
#
# Last updated by: $Author$
# Last updated on: $Date$
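#
# Input:  Tandem Repeats Finder (TRF) .dat output files matching *processed_traces*fa*dat in the -dir directory
# Output: hos.out.csv (a tab-delimited table of putative higher-order repeats) plus a summary of HOS levels to STDOUT
#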
use strict;
use warnings;
use Keith;
use FAlite;
use Getopt::Long;
###############################
#
# Command-line options
#
################################
my $dir; # directory containing data files
my $min_copies; # minimum number of copies of repeat in read
my $min_length; # minimum length of repeat unit
my $offset; # what offset between overlapping repeats is allowed
my $help; # print help
GetOptions ("dir=s" => \$dir,
"copies:f" => \$min_copies,
"length:i" => \$min_length,
"offset:i" => \$offset,
"help" => \$help
);
###############################
#
# Set defaults
#
################################
# these are extra options that we can only implement through post-processing of raw trf output
$min_copies = 2 if (not defined $min_copies);
$min_length = 25 if (not defined $min_length);
$offset = 5 if (not defined $offset);
###############################
#
# Check command line options
#
################################
my $usage = "
usage: HOS_finder.pl [options]
options:
-dir <directory name containing fasta files>
-copies (minimum copies) [$min_copies]
-length (minimum repeat length) [$min_length]
-offset (offset allowed between overlapping repeats) [$offset]
-help : print this help
";
die "Specify -dir option to process all *.dat files in specified directory\n" unless ($dir);
die $usage if ($help);
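# example invocation (directory name is illustrative):
#   ./HOS_finder.pl -dir trf_output -copies 2 -length 25 -offset 5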
###############################
#
# Variables
#
###############################
# hash to store sequence lengths
my %seq_to_length;
# one hash to store details of repeats with higher order structure (at least 3 different repeat lengths)
my %hos;
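# structure of %hos (as populated below):
#   key   -> "sequence_header,sequence_counter"
#   value -> list of strings of the form "start,end,consensus_length,copies,file,ratio"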
###############################
#
# Loop through input file(s)
#
###############################
print STDERR "\n# $0 started at ", `date`, "\n";
foreach my $file (glob("$dir/*processed_traces*fa*dat")){
print STDERR "Processing $file\n";
# skip zero byte files
next unless (-s $file);
# want to capture the (slightly processed) FASTA headers in the trf output
my $header;
open(my $data,"<", "$file") or die "Can't open $file\n";
my $seq_repeat_counter; # to keep track of when a sequence contains more than one repeat
my @seq_repeat_data; # to keep track of data on all repeats in a sequence to see if they are just multiples of each other
# need a generic counter for each sequence that might contain tandem repeats
# will form part of a unique ID later on
my $seq_count = 0;
REPEAT: while(<$data>){
# skip blank lines
next if (m/^$/);
# Need to know when we get to each new sequence in file (1 sequence might contain multiple tandem repeats)
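# each sequence record begins with a line like the following (the text after 'Sequence: ' is an illustrative read ID):
# Sequence: read_0001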
if (m/^Sequence: (.*)/) {
$header = $1;
$seq_count++;
# reset certain counters and data
$seq_repeat_counter = 0;
@seq_repeat_data = ();
# and now move to next line in file
next REPEAT;
}
# the main output that we are interested in will all be on one line which starts with various
# numerical details of the repeat
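# an illustrative data line (values are made up), with fields in the order of the variable names in the split below:
# 10 359 175 2.0 175 95 2 620 28 21 25 24 1.97 ACGTACGTT...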
if(m/^\d+ \d+ \d+ \d+\.\d /){
# capture repeat data into various variables (most will be unused)
my ($start,$end,$period,$copies,$consensus,$matches,$indels,$score,$a,$c,$g,$t,$entropy,$repeat_seq) = split(/\s+/);
###################################################################
# now apply various criteria to see if we want to keep this repeat
###################################################################
# are there enough copies of the repeat in the read? N.B. TRF can report fewer than 2 copies
next REPEAT unless ($copies >= $min_copies);
# is the repeat length at least $min_length (we want to avoid comparing things like 8 nt vs 4 nt repeats)
next REPEAT unless ($consensus >= $min_length);
# if we get this far then we will capture some of the repeat data in a hash
# this is to potentially compare to other repeats in the same sequence
$seq_repeat_counter++;
$seq_repeat_data[$seq_repeat_counter]{start} = "$start";
$seq_repeat_data[$seq_repeat_counter]{end} = "$end";
$seq_repeat_data[$seq_repeat_counter]{consensus} = "$consensus";
$seq_repeat_data[$seq_repeat_counter]{copies} = "$copies";
$seq_repeat_data[$seq_repeat_counter]{file} = "$file";
# add repeat info to %hos hash, but only if it is the first repeat in the sequence
my $key = "$header,$seq_count";
my $info = "$start,$end,$consensus,$copies,$file,1";
push(@{$hos{$key}}, $info) if ($seq_repeat_counter == 1);
# no point going any further until we have seen at least 2 repeats within the current sequence
next REPEAT unless ($seq_repeat_counter > 1);
# loop through previous repeats. Want to see if one is a multiple of another
for (my $i = 1; $i < $seq_repeat_counter; $i++){
# want to see if current start/end coordinates are identical or within $offset nt of previous repeat
next unless (abs($start - $seq_repeat_data[$i]{start}) <= $offset);
next unless (abs($end - $seq_repeat_data[$i]{end}) <= $offset);
# $result is the ratio of the longer repeat to the shorter one; the longer repeat must be at least 1.85 x the length of the shorter repeat
# how we calculate this depends on whether current repeat length is longer than previous one or not
my $result;
if ($consensus < $seq_repeat_data[$i]{consensus}){
$result = $seq_repeat_data[$i]{'consensus'}/ $consensus;
} else{
$result = $consensus / $seq_repeat_data[$i]{'consensus'};
}
my $processed_result = $result;
# reduce the ratio to its fractional part, e.g. 2.02 -> 0.02
$processed_result =~ s/\d+\.(\d+)/0\.$1/;
# keep only ratios that are close to an integer multiple (fractional part within 0.15 of 0 or 1) and greater than 1.5 overall
next unless (($processed_result < 0.15 or $processed_result > 0.85) && ($result > 1.5));
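# worked example (numbers are illustrative): repeats of consensus length 50 nt and 101 nt spanning the same
# region give $result = 2.02; its fractional part (0.02) is < 0.15 and 2.02 > 1.5, so the longer repeat
# is treated as a higher-order multiple of the shorter unit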
# we should now be looking at a multiple repeat so add it to hash
my $info = "$start,$end,$consensus,$copies,$file,$result";
push(@{$hos{$key}}, $info);
# if we have seen a match, don't need to look any further
next REPEAT;
}
}
}
close($data);
}
# want a hash to keep track of how many 2nd-level, 3rd-level etc. HOS are present
my %hos_count;
# now loop through %hos structure and write output to file
open(my $out, ">", "hos.out.csv") or die "Can't write to hos.out.csv\n";
print $out "HOS_ID\tHOS_LEVEL\tFILE\tSEQ_ID\tSTART\tEND\tREPEAT_LENGTH\tCOPIES\n";
my $counter = 1;
foreach my $key (keys %hos){
my $matches = @{$hos{$key}};
$hos_count{$matches}++;
my ($seq_id, $seq_count) = split(/,/,$key);
if ($matches > 2){
foreach my $match (@{$hos{$key}}){
my ($start, $end, $repeat_length, $copies, $file, $n) = split(/,/, $match);
$file =~ s#^\.\/##;
$file =~ s/(.*\.\d{3}\.fa).*/$1/;
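# e.g. "./species.processed_traces.001.fa.2.7.7.80.10.50.2000.dat" -> "species.processed_traces.001.fa" (illustrative filename)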
print $out "$counter\t$matches\t$file\t$seq_id\t$start\t$end\t$repeat_length\t$copies\n";
}
$counter++;
}
}
close($out);
print "\n";
foreach my $key (sort {$a <=> $b} keys %hos_count){
print "HOS level $key -> $hos_count{$key} occurences found\n";
}
print STDERR "\n# $0 finished at ", `date`, "\n";
exit(0);