00001
00002
00010
00011 #include "Contig_AMOS.hh"
00012 #include <cctype>
00013 using namespace AMOS;
00014 using namespace std;
00015
00016
00017
00018
00019
00020 const NCode_t Contig_t::NCODE = M_CONTIG;
00021
00022
00023
00024 void Contig_t::indexGaps()
00025 {
00026 if (gapsvalid_m) { return; }
00027 gapsvalid_m = true;
00028
00029 gaps_m.clear();
00030
00031 Pos_t len = getLength();
00032
00033 for (Pos_t i = 0; i < len; i++)
00034 {
00035 if (getBase(i).first == '-')
00036 {
00037 gaps_m.push_back(i);
00038 }
00039 }
00040 }
00041
00042
00043
00044 Pos_t Contig_t::gap2ungap (Pos_t gap)
00045 {
00046 indexGaps();
00047
00048 int l = gaps_m.size();
00049
00050 Pos_t retval = gap;
00051
00052 for (int i = 0; (i < l) && (gaps_m[i] <= gap); i++)
00053 {
00054 retval--;
00055 }
00056
00057 retval++;
00058
00059 return retval;
00060 }
00061
00062
00063
00064 Size_t Contig_t::ungap2gap (Pos_t ungap)
00065 {
00066 indexGaps();
00067
00068 int l = gaps_m.size();
00069
00070 Pos_t retval = ungap-1;
00071
00072 for (int i = 0; (i < l) && (gaps_m[i] <= retval); i++)
00073 {
00074 retval++;
00075 }
00076
00077 return retval;
00078 }
00079
00080
00081
00082 Size_t Contig_t::getSpan ( ) const
00083 {
00084 Pos_t hi,lo;
00085
00086
00087 if ( reads_m . empty( ) )
00088 {
00089 lo = hi = 0;
00090 }
00091 else
00092 {
00093 vector<Tile_t>::const_iterator ti = reads_m . begin( );
00094
00095 lo = ti -> offset;
00096 hi = ti -> offset + ti -> range . getLength( );
00097
00098 for ( ++ ti; ti != reads_m . end( ); ++ ti )
00099 {
00100 if ( ti -> offset < lo )
00101 lo = ti -> offset;
00102 if ( ti -> offset + ti -> getGappedLength( ) > hi )
00103 hi = ti -> offset + ti -> getGappedLength( );
00104 }
00105 }
00106
00107 return hi - lo;
00108 }
00109
00110
00111
00112 Size_t Contig_t::getUngappedLength( ) const
00113 {
00114 Size_t retval = length_m;
00115
00116 for (Pos_t i = 0; i < length_m; ++ i )
00117 if ( ! isalpha (getBase (i) . first) )
00118 -- retval;
00119
00120 return retval;
00121 }
00122
00123
00124
00125 string Contig_t::getUngappedQualString (Range_t range) const
00126 {
00127 Pos_t lo = range . getLo( );
00128 Pos_t hi = range . getHi( );
00129
00130
00131 if ( lo < 0 || hi > getLength( ) )
00132 AMOS_THROW_ARGUMENT ("Invalid quality subrange");
00133
00134 string retval;
00135 retval . reserve (hi - lo);
00136 pair<char, char> seqqualc;
00137
00138
00139 for ( ; lo < hi; lo ++ )
00140 {
00141 seqqualc = getBase (lo);
00142 if ( isalpha (seqqualc . first) )
00143 retval . push_back (seqqualc . second);
00144 }
00145
00146 if ( range . isReverse( ) )
00147 AMOS::Reverse (retval);
00148
00149 return retval;
00150 }
00151
00152
00153
00154 string Contig_t::getUngappedSeqString (Range_t range) const
00155 {
00156 Pos_t lo = range . getLo( );
00157 Pos_t hi = range . getHi( );
00158
00159
00160 if ( lo < 0 || hi > getLength( ) )
00161 AMOS_THROW_ARGUMENT ("Invalid sequence subrange");
00162
00163 string retval;
00164 retval . reserve (hi - lo);
00165 char seqc;
00166
00167
00168 for ( ; lo < hi; lo ++ )
00169 {
00170 seqc = getBase (lo) . first;
00171 if ( isalpha (seqc) )
00172 retval . push_back (seqc);
00173 }
00174
00175 if ( range . isReverse( ) )
00176 AMOS::ReverseComplement (retval);
00177
00178 return retval;
00179 }
00180
00181
00182
00183 double Contig_t::getCovStat(const double globalArrivalRate) const
00184 {
00185 const float ln2=0.69314718055994530941723212145818;
00186
00187 assert(globalArrivalRate != -1);
00188 return (getAvgRho() * globalArrivalRate) - (ln2 * (reads_m.size() -1));
00189 }
00190
00191
00192
00193 double Contig_t::getAvgRho( ) const
00194 {
00195 double avgRho = 0;
00196 Pos_t lo, hi;
00197 Size_t lenLo, lenHi;
00198
00199 if ( !reads_m . empty( ) )
00200 {
00201 vector<Tile_t>::const_iterator ti = reads_m . begin( );
00202
00203 lo = ti -> offset;
00204 lenLo = ti->range.getLength();
00205 hi = ti -> offset + ti -> range . getLength( );
00206 lenHi = ti->range.getLength();
00207
00208 for ( ++ ti; ti != reads_m . end( ); ++ ti )
00209 {
00210 if ( ti -> offset < lo )
00211 lo = ti -> offset;
00212 lenLo = ti->range.getLength();
00213 if ( ti -> offset + ti -> range.getLength( ) > hi )
00214 hi = ti -> offset + ti -> range.getLength();
00215 lenHi = ti->range.getLength();
00216 }
00217
00218 double avgLen = (lenLo + lenHi) / 2;
00219 if (avgLen < getLength()) {
00220 avgRho = getLength() - avgLen;
00221 }
00222 }
00223
00224 return avgRho;
00225 }
00226
00227
00228
00229 void Contig_t::insertGapColumn (Pos_t gindex)
00230 {
00231
00232 string qual(getQualString());
00233 string seq(getSeqString());
00234
00235 if (gindex > seq.length())
00236 {
00237 AMOS_THROW_ARGUMENT("Invalid position specified for inserting gap column");
00238 }
00239
00240 seq.insert(gindex, "-", 1);
00241 qual.insert(gindex, "N", 1);
00242 setSequence(seq, qual);
00243
00244
00245 vector<Tile_t>::iterator i;
00246 for (i = reads_m.begin();
00247 i != reads_m.end();
00248 i++)
00249 {
00250 if (i->offset >= gindex)
00251 {
00252
00253
00254 i->offset++;
00255 }
00256 else if (i->getRightOffset() < gindex)
00257 {
00258
00259 }
00260 else
00261 {
00262
00263 int gseqpos = gindex - i->offset;
00264
00265
00266 vector<Pos_t> newgaps;
00267
00268
00269 int gapcount = 0;
00270 vector<Pos_t>::iterator g;
00271 for (g = i->gaps.begin();
00272 g != i->gaps.end();
00273 g++, gapcount++)
00274 {
00275 int cgseqpos = *g+gapcount;
00276 if (cgseqpos > gseqpos) { break; }
00277
00278 newgaps.push_back(*g);
00279 }
00280
00281
00282 newgaps.push_back(gseqpos-gapcount);
00283
00284
00285 while (g != i->gaps.end())
00286 {
00287 newgaps.push_back(*g);
00288 g++;
00289 }
00290
00291 i->gaps = newgaps;
00292 }
00293 }
00294 }
00295
00296
00297
00298
00299 void Contig_t::readMessage (const Message_t & msg)
00300 {
00301 gapsvalid_m = false;
00302 Sequence_t::readMessage (msg);
00303
00304 try {
00305 vector<Message_t>::const_iterator i;
00306
00307 for ( i = msg . getSubMessages( ) . begin( );
00308 i != msg . getSubMessages( ) . end( ); i ++ )
00309 {
00310 if ( i -> getMessageCode( ) == M_TILE )
00311 {
00312 reads_m . push_back (Tile_t( ));
00313 reads_m . back( ) . readMessage (*i);
00314 }
00315 }
00316 }
00317 catch (ArgumentException_t) {
00318
00319 clear( );
00320 throw;
00321 }
00322
00323 }
00324
00325
00326
00327 void Contig_t::readRecord (istream & fix, istream & var)
00328 {
00329 gapsvalid_m = false;
00330 Sequence_t::readRecord (fix, var);
00331
00332 Size_t sizet;
00333 readLE (fix, &sizet);
00334
00335 reads_m . resize (sizet);
00336 for ( Pos_t i = 0; i < sizet; i ++ )
00337 reads_m [i] . readRecord (var);
00338 }
00339
00340
00341
00342 void Contig_t::readRecordFix (istream & fix)
00343 {
00344 gapsvalid_m = false;
00345 Sequence_t::readRecordFix (fix);
00346
00347 Size_t sizet;
00348 readLE (fix, &sizet);
00349
00350 reads_m.clear();
00351 reads_m . resize (sizet);
00352 }
00353
00354
00355
00356 bool Contig_t::readUMD (istream & in)
00357 {
00358 gapsvalid_m = false;
00359 string eid;
00360 Tile_t tile;
00361 istringstream ss;
00362 string line;
00363
00364 while ( line . empty( ) || line [0] != 'C' )
00365 {
00366 if ( !in . good( ) )
00367 return false;
00368 getline (in, line);
00369 }
00370
00371 clear( );
00372
00373 try {
00374
00375 ss . str (line);
00376 ss . ignore( );
00377 ss >> eid;
00378 if ( !ss )
00379 AMOS_THROW_ARGUMENT ("Invalid contig ID");
00380 ss . clear( );
00381 setEID (eid);
00382
00383 while ( true )
00384 {
00385 getline (in, line);
00386 if ( line . empty( ) )
00387 break;
00388
00389 ss . str (line);
00390 ss >> tile . source;
00391 ss >> tile . range . begin;
00392 ss >> tile . range . end;
00393 if ( !ss )
00394 AMOS_THROW_ARGUMENT ("Invalid read layout");
00395 ss . clear( );
00396 tile . offset = tile . range . begin < tile . range . end ?
00397 tile . range . begin : tile . range . end;
00398 tile . range . begin -= tile . offset;
00399 tile . range . end -= tile . offset;
00400 getReadTiling( ) . push_back (tile);
00401 }
00402
00403 }
00404 catch (IOException_t) {
00405
00406
00407 clear( );
00408 throw;
00409 }
00410
00411 return true;
00412 }
00413
00414 void Contig_t::reverseComplement()
00415 {
00416
00417 string qual(getQualString());
00418 string seq(getSeqString());
00419 int conslen = seq.length();
00420
00421 AMOS::ReverseComplement(seq);
00422 reverse(qual.begin(), qual.end());
00423 setSequence(seq, qual);
00424
00425
00426 vector<Tile_t>::iterator i;
00427 for (i = reads_m.begin();
00428 i != reads_m.end();
00429 i++)
00430 {
00431 i->range.swap();
00432 i->offset = conslen - (i->offset + i->getGappedLength());
00433
00434 Pos_t len = i->range.getLength();
00435
00436 vector<Pos_t>::iterator g;
00437 for (g = i->gaps.begin();
00438 g != i->gaps.end();
00439 g++)
00440 {
00441 *g = len - *g;
00442 }
00443
00444 reverse(i->gaps.begin(), i->gaps.end());
00445 }
00446 }
00447
00448
00449
00450 void Contig_t::writeMessage (Message_t & msg) const
00451 {
00452 Sequence_t::writeMessage (msg);
00453
00454 try {
00455 Pos_t begin = msg . getSubMessages( ) . size( );
00456 msg . getSubMessages( ) . resize (begin + reads_m . size( ));
00457
00458 msg . setMessageCode (Contig_t::NCODE);
00459
00460 if ( !reads_m . empty( ) )
00461 {
00462 vector<Tile_t>::const_iterator tvi;
00463 for ( tvi = reads_m . begin( ); tvi != reads_m . end( ); ++ tvi )
00464 tvi -> writeMessage (msg . getSubMessages( ) [begin ++]);
00465 }
00466 }
00467 catch (ArgumentException_t) {
00468
00469 msg . clear( );
00470 throw;
00471 }
00472 }
00473
00474
00475
00476 void Contig_t::writeRecord (ostream & fix, ostream & var) const
00477 {
00478 Sequence_t::writeRecord (fix, var);
00479
00480 Size_t sizet = reads_m . size( );
00481 writeLE (fix, &sizet);
00482
00483 for ( Pos_t i = 0; i < sizet; i ++ )
00484 reads_m [i] . writeRecord (var);
00485 }
00486
00487
00488
00489 void Contig_t::writeUMD (ostream & out) const
00490 {
00491 vector<Tile_t>::const_iterator ti;
00492
00493 out << "C " << getEID( ) << endl;
00494
00495 for ( ti = reads_m . begin( ); ti != reads_m . end( ); ti ++ )
00496 out << ti -> source << ' '
00497 << ti -> range . begin + ti -> offset << ' '
00498 << ti -> range . end + ti -> offset << endl;
00499
00500 out << endl;
00501
00502 if ( !out . good( ) )
00503 AMOS_THROW_IO ("UMD message write failure");
00504 }