Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

Kmer_AMOS.hh

Go to the documentation of this file.
00001 00002 00003 00004 00005 00006 00007 00008 00009 00010 #ifndef __Kmer_AMOS_HH 00011 #define __Kmer_AMOS_HH 1 00012 00013 #include "Universal_AMOS.hh" 00014 #include <vector> 00015 #include <string> 00016 00017 00018 00019 00020 namespace AMOS { 00021 00022 //================================================ Kmer_t ====================== 00028 //============================================================================== 00029 class Kmer_t : public Universal_t 00030 { 00031 00032 private: 00033 00034 uint8_t * seq_m; 00035 uint32_t count_m; 00036 uint8_t length_m; 00037 std::vector<ID_t> reads_m; 00038 00039 00040 protected: 00041 00042 static const uint8_t ADENINE_BITS = 0x0; 00043 static const uint8_t CYTOSINE_BITS = 0x40; 00044 static const uint8_t GUANINE_BITS = 0x80; 00045 static const uint8_t THYMINE_BITS = 0xC0; 00046 static const uint8_t SEQ_BITS = 0xC0; 00047 00048 00049 //--------------------------------------------------- compress --------------- 00059 static uint8_t compress (char seqchar) 00060 { 00061 switch ( toupper(seqchar) ) 00062 { 00063 case 'A': return ADENINE_BITS; 00064 case 'C': return CYTOSINE_BITS; 00065 case 'G': return GUANINE_BITS; 00066 case 'T': return THYMINE_BITS; 00067 default: 00068 AMOS_THROW_ARGUMENT ((std::string)"Invalid Kmer character " + seqchar); 00069 } 00070 } 00071 00072 00073 //--------------------------------------------------- uncompress ------------- 00082 static char uncompress (uint8_t byte) 00083 { 00084 switch ( byte & SEQ_BITS ) 00085 { 00086 case ADENINE_BITS: return 'A'; 00087 case CYTOSINE_BITS: return 'C'; 00088 case GUANINE_BITS: return 'G'; 00089 case THYMINE_BITS: return 'T'; 00090 default: 00091 AMOS_THROW_ARGUMENT ("Unknown logic error"); 00092 } 00093 } 00094 00095 00096 //--------------------------------------------------- readRecord ------------- 00097 virtual void readRecord (std::istream & fix, std::istream & var); 00098 00099 00100 //--------------------------------------------------- writeRecord ------------ 00101 virtual void writeRecord (std::ostream & fix, std::ostream & var) const; 00102 00103 00104 public: 00105 00106 static const NCode_t NCODE; 00108 00109 static const uint8_t MAX_LENGTH; 00111 00112 00113 //--------------------------------------------------- Kmer_t ----------------- 00118 Kmer_t ( ) 00119 { 00120 seq_m = NULL; 00121 count_m = length_m = 0; 00122 } 00123 00124 00125 //--------------------------------------------------- Kmer_t ----------------- 00128 Kmer_t (const Kmer_t & source) 00129 { 00130 seq_m = NULL; 00131 *this = source; 00132 } 00133 00134 00135 //--------------------------------------------------- ~Kmer_t ---------------- 00140 ~Kmer_t ( ) 00141 { 00142 free (seq_m); 00143 } 00144 00145 00146 //--------------------------------------------------- clear ------------------ 00147 virtual void clear ( ); 00148 00149 00150 //--------------------------------------------------- getBase ---------------- 00160 // -- developers note -- 00161 // If we imagine consecutive bytes stored left-to-right, then we can index 00162 // the seqchars left-to-right as follows: 00163 // [0] [1] [2] [3] [4] [5] [6] [7] ... seqchars 00164 // [7 6 5 4 3 2 1 0] [7 6 5 4 3 2 1 0] ... bits 00165 // [byte 0 ] [byte 1 ] ... bytes 00166 // Thus, to retrieve seqchar 5 we need to index byte 1. Since uncompress 00167 // expects the sequence bits in the high-order end of the byte, we left 00168 // shift 2-bits. The uncompress method will take care of the necessary 00169 // masking. 00170 // 00171 char getBase (Pos_t index) const 00172 { 00173 if ( index < 0 || index >= length_m ) 00174 AMOS_THROW_ARGUMENT ("Requested kmer index is out of range"); 00175 return uncompress ((seq_m [index / 4]) << (index % 4 * 2)); 00176 } 00177 00178 00179 //--------------------------------------------------- getCount --------------- 00184 uint32_t getCount ( ) const 00185 { 00186 return count_m; 00187 } 00188 00189 00190 //--------------------------------------------------- getLength -------------- 00195 uint8_t getLength ( ) const 00196 { 00197 return length_m; 00198 } 00199 00200 00201 //--------------------------------------------------- getNCode --------------- 00202 virtual NCode_t getNCode ( ) const 00203 { 00204 return Kmer_t::NCODE; 00205 } 00206 00207 00208 //--------------------------------------------------- getReads --------------- 00213 const std::vector<ID_t> & getReads ( ) const 00214 { 00215 return reads_m; 00216 } 00217 00218 00219 //--------------------------------------------------- getReads --------------- 00224 std::vector<ID_t> & getReads ( ) 00225 { 00226 return reads_m; 00227 } 00228 00229 00230 //--------------------------------------------------- getSeqString ----------- 00237 std::string getSeqString ( ) const; 00238 00239 00240 //--------------------------------------------------- readMessage ------------ 00241 virtual void readMessage (const Message_t & msg); 00242 00243 00244 //--------------------------------------------------- setBase ---------------- 00257 // -- developers note -- 00258 // If we imagine consecutive bytes stored left-to-right, then we can index 00259 // the seqchars left-to-right as follows: 00260 // [0] [1] [2] [3] [4] [5] [6] [7] ... seqchars 00261 // [7 6 5 4 3 2 1 0] [7 6 5 4 3 2 1 0] ... bits 00262 // [byte 0 ] [byte 1 ] ... bytes 00263 // Thus, to set seqchar 2, we need overwrite the bits at the 2^3 and 2^2 00264 // positions of byte 0. Since compress returns the sequence bits in the 00265 // high-order end of the byte, we right-shift the return value 4-bits and 00266 // "OR" it with the stored byte (making sure to clear those two bits first). 00267 // 00268 void setBase (char seqchar, Pos_t index) 00269 { 00270 if ( index < 0 || index >= length_m ) 00271 AMOS_THROW_ARGUMENT ("Requested kmer index is out of range"); 00272 00273 int offset = index % 4 * 2; // the bitmask offset 00274 uint8_t * seqp = seq_m + index / 4; // the required byte 00275 00276 //-- Delete the previous two bits, then set the two new bits 00277 *seqp &= ~(SEQ_BITS >> offset); 00278 *seqp |= compress (seqchar) >> offset; 00279 } 00280 00281 00282 //--------------------------------------------------- setCount --------------- 00288 void setCount (uint32_t count) 00289 { 00290 count_m = count; 00291 } 00292 00293 00294 //--------------------------------------------------- setReads --------------- 00300 void setReads (const std::vector<ID_t> & reads) 00301 { 00302 reads_m = reads; 00303 } 00304 00305 00306 //--------------------------------------------------- setSeqString ----------- 00320 void setSeqString (const std::string & seq); 00321 00322 00323 //--------------------------------------------------- operator++ ------------- 00330 Kmer_t & operator++ (int) 00331 { 00332 count_m ++; 00333 return *this; 00334 } 00335 00336 00337 //--------------------------------------------------- writeMessage ----------- 00338 virtual void writeMessage (Message_t & msg) const; 00339 00340 00341 //--------------------------------------------------- operator= -------------- 00349 Kmer_t & operator= (const Kmer_t & source); 00350 }; 00351 00352 } // namespace AMOS 00353 00354 #endif // #ifndef __Kmer_AMOS_HH

Generated on Tue May 17 15:19:01 2005 for libAMOS by doxygen 1.3.8