00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef __Sequence_AMOS_HH
00011 #define __Sequence_AMOS_HH 1
00012
00013 #include "Universal_AMOS.hh"
00014 #include <string>
00015
00016
00017
00018
00019 namespace AMOS {
00020
00021
00035
00036 class Sequence_t : public Universal_t
00037 {
00038
00039 protected:
00040
00041 uint8_t * seq_m;
00042 uint8_t * qual_m;
00043 Size_t length_m;
00044
00045
00046 static const uint8_t COMPRESS_BIT = 0x1;
00047 static const uint8_t ADENINE_BITS = 0x0;
00048 static const uint8_t CYTOSINE_BITS = 0x40;
00049 static const uint8_t GUANINE_BITS = 0x80;
00050 static const uint8_t THYMINE_BITS = 0xC0;
00051 static const uint8_t SEQ_BITS = 0xC0;
00052 static const uint8_t QUAL_BITS = 0x3F;
00053
00054
00055
00064 static inline uint8_t compress (char seqchar, char qualchar)
00065 {
00066
00067 qualchar = Char2Qual (qualchar);
00068
00069 if ( qualchar & SEQ_BITS )
00070 return 0;
00071
00072 switch ( toupper (seqchar) )
00073 {
00074 case 'A': return (uint8_t)qualchar | ADENINE_BITS;
00075 case 'C': return (uint8_t)qualchar | CYTOSINE_BITS;
00076 case 'G': return (uint8_t)qualchar | GUANINE_BITS;
00077 case 'T': return (uint8_t)qualchar | THYMINE_BITS;
00078 case 'N': return 0;
00079 default:
00080 return 0;
00081 }
00082 }
00083
00084
00085
00093 static inline std::pair<char, char> uncompress (uint8_t byte)
00094 {
00095 std::pair<char, char> retval;
00096
00097 switch ( byte & SEQ_BITS )
00098 {
00099 case ADENINE_BITS: retval . first = 'A'; break;
00100 case CYTOSINE_BITS: retval . first = 'C'; break;
00101 case GUANINE_BITS: retval . first = 'G'; break;
00102 case THYMINE_BITS: retval . first = 'T'; break;
00103 }
00104
00105 byte &= QUAL_BITS;
00106 if ( byte == 0 )
00107 retval . first = 'N';
00108
00109 retval . second = Qual2Char (byte);
00110
00111 return retval;
00112 }
00113
00114
00115
00116 virtual void readRecord (std::istream & fix, std::istream & var);
00117
00118
00119 virtual void readRecordFix (std::istream & fix);
00120
00121
00122 virtual void writeRecord (std::ostream & fix, std::ostream & var) const;
00123
00124
00125 public:
00126
00127 static const NCode_t NCODE;
00129
00130
00131
00136 Sequence_t ( )
00137 {
00138 seq_m = qual_m = NULL;
00139 length_m = 0;
00140 }
00141
00142
00143
00146 Sequence_t (const Sequence_t & source)
00147 {
00148 seq_m = qual_m = NULL;
00149 *this = source;
00150 }
00151
00152
00153
00158 virtual ~Sequence_t ( )
00159 {
00160 free (seq_m);
00161 free (qual_m);
00162 }
00163
00164
00165
00171 virtual void clear ( );
00172
00173
00174
00192 void compress ( );
00193
00194
00195
00205 std::pair<char, char> getBase (Pos_t index) const
00206 {
00207 if (seq_m == NULL)
00208 AMOS_THROW_ARGUMENT("No sequence data");
00209
00210 if ( index < 0 || index >= length_m )
00211 AMOS_THROW_ARGUMENT ("Requested sequence index is out of range");
00212
00213 if ( isCompressed( ) )
00214 return uncompress (seq_m [index]);
00215 else
00216 return std::make_pair ((char)(seq_m [index]), (char)(qual_m [index]));
00217 }
00218
00219
00220
00221
00226 virtual double getGCContent (const Range_t & rng) const
00227 {
00228 int gc = 0;
00229 int all = 0;
00230
00231 int start = rng.getLo();
00232 int stop = rng.getHi();
00233
00234
00235
00236 for (int i = start; i < stop; i++)
00237 {
00238 switch(getBase(i).first)
00239 {
00240 case 'A':
00241 case 'a':
00242 case 'T':
00243 case 't': all++; break;
00244
00245 case 'C':
00246 case 'c':
00247 case 'G':
00248 case 'g': all++; gc++; break;
00249 };
00250 }
00251
00252 return (all) ? ((double)gc)/all : 0.0;
00253 }
00254
00255
00260 virtual double getGCContent ( ) const
00261 {
00262 return getGCContent(Range_t(0, length_m));
00263 }
00264
00265
00266
00267
00272 Size_t getLength ( ) const
00273 {
00274 return length_m;
00275 }
00276
00277
00278
00279 virtual NCode_t getNCode ( ) const
00280 {
00281 return Sequence_t::NCODE;
00282 }
00283
00284
00285
00290 std::string getQualString ( ) const
00291 {
00292 return getQualString (Range_t (0, length_m));
00293 }
00294
00295
00296
00307 std::string getQualString (Range_t range) const;
00308
00309
00310
00315 std::string getSeqString ( ) const
00316 {
00317 return getSeqString (Range_t (0, length_m));
00318 }
00319
00320
00321
00332 std::string getSeqString (Range_t range) const;
00333
00334
00335
00343 bool isCompressed ( ) const
00344 {
00345 return flags_m . nibble & COMPRESS_BIT;
00346 }
00347
00348
00349
00350 virtual void readMessage (const Message_t & msg);
00351
00352
00353
00373 void setBase (char seqchar, char qualchar, Pos_t index)
00374 {
00375 if (seq_m == NULL)
00376 AMOS_THROW_ARGUMENT("No sequence data");
00377
00378 if ( index < 0 || index >= length_m )
00379 AMOS_THROW_ARGUMENT ("Requested sequence index is out of range");
00380
00381 if ( isCompressed( ) )
00382 seq_m [index] = compress (seqchar, qualchar);
00383 else
00384 {
00385 seq_m [index] = seqchar;
00386 qual_m [index] = qualchar;
00387 }
00388 }
00389
00390
00391
00406 void setSequence (const char * seq, const char * qual);
00407
00408
00409
00424 void setSequence (const std::string & seq, const std::string & qual);
00425
00426
00427
00437 void uncompress ( );
00438
00439
00440
00448 Sequence_t & operator= (const Sequence_t & source);
00449
00450
00451
00452 virtual void writeMessage (Message_t & msg) const;
00453
00454 };
00455
00456 }
00457
00458 #endif // #ifndef __Sequence_AMOS_HH