Kmer_AMOS.hh

Go to the documentation of this file.
00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 #ifndef __Kmer_AMOS_HH
00011 #define __Kmer_AMOS_HH 1
00012 
00013 #include "Universal_AMOS.hh"
00014 #include <vector>
00015 #include <string>
00016 
00017 
00018 
00019 
00020 namespace AMOS {
00021 
00022 //================================================ Kmer_t ======================
00028 //==============================================================================
00029 class Kmer_t : public Universal_t
00030 {
00031   
00032 private:
00033 
00034   uint8_t * seq_m;              
00035   uint32_t count_m;             
00036   uint8_t length_m;             
00037   std::vector<ID_t> reads_m;    
00038 
00039 
00040 protected:
00041 
00042   static const uint8_t ADENINE_BITS  = 0x0;   
00043   static const uint8_t CYTOSINE_BITS = 0x40;  
00044   static const uint8_t GUANINE_BITS  = 0x80;  
00045   static const uint8_t THYMINE_BITS  = 0xC0;  
00046   static const uint8_t SEQ_BITS      = 0xC0;  
00047 
00048 
00049   //--------------------------------------------------- compress ---------------
00059   static uint8_t compress (char seqchar)
00060   {
00061     switch ( toupper(seqchar) )
00062       {
00063       case 'A': return ADENINE_BITS;
00064       case 'C': return CYTOSINE_BITS;
00065       case 'G': return GUANINE_BITS;
00066       case 'T': return THYMINE_BITS;
00067       default:
00068         AMOS_THROW_ARGUMENT ((std::string)"Invalid Kmer character " + seqchar);
00069       }
00070   }
00071 
00072 
00073   //--------------------------------------------------- uncompress -------------
00082   static char uncompress (uint8_t byte)
00083   {
00084     switch ( byte & SEQ_BITS )
00085       {
00086       case ADENINE_BITS:  return 'A';
00087       case CYTOSINE_BITS: return 'C';
00088       case GUANINE_BITS:  return 'G';
00089       case THYMINE_BITS:  return 'T';
00090       default:
00091         AMOS_THROW_ARGUMENT ("Unknown logic error");
00092       }
00093   }
00094 
00095 
00096   //--------------------------------------------------- readRecord -------------
00097   virtual void readRecord (std::istream & fix, std::istream & var);
00098 
00099 
00100   //--------------------------------------------------- readRecordFix ----------
00101   virtual void readRecordFix (std::istream & fix);
00102 
00103 
00104   //--------------------------------------------------- writeRecord ------------
00105   virtual void writeRecord (std::ostream & fix, std::ostream & var) const;
00106 
00107 
00108 public:
00109 
00110   static const NCode_t NCODE;
00112 
00113   static const uint8_t MAX_LENGTH;
00115 
00116 
00117   //--------------------------------------------------- Kmer_t -----------------
00122   Kmer_t ( )
00123   {
00124     seq_m = NULL;
00125     count_m = length_m = 0;
00126   }
00127 
00128 
00129   //--------------------------------------------------- Kmer_t -----------------
00132   Kmer_t (const Kmer_t & source)
00133   {
00134     seq_m = NULL;
00135     *this = source;
00136   }
00137 
00138 
00139   //--------------------------------------------------- ~Kmer_t ----------------
00144   ~Kmer_t ( )
00145   {
00146     free (seq_m);
00147   }
00148 
00149 
00150   //--------------------------------------------------- clear ------------------
00151   virtual void clear ( );
00152 
00153 
00154   //--------------------------------------------------- getBase ----------------
00164   //  -- developers note --
00165   //  If we imagine consecutive bytes stored left-to-right, then we can index
00166   //  the seqchars left-to-right as follows:
00167   //   [0] [1] [2] [3]   [4] [5] [6] [7]  ... seqchars
00168   //  [7 6 5 4 3 2 1 0] [7 6 5 4 3 2 1 0] ... bits
00169   //  [byte 0         ] [byte 1         ] ... bytes
00170   //  Thus, to retrieve seqchar 5 we need to index byte 1. Since uncompress
00171   //  expects the sequence bits in the high-order end of the byte, we left
00172   //  shift 2-bits. The uncompress method will take care of the necessary
00173   //  masking.
00174   //
00175   char getBase (Pos_t index) const
00176   {
00177     if ( index < 0 || index >= length_m )
00178       AMOS_THROW_ARGUMENT ("Requested kmer index is out of range");
00179     return uncompress ((seq_m [index / 4]) << (index % 4 * 2));
00180   }
00181 
00182 
00183   //--------------------------------------------------- getCount ---------------
00188   uint32_t getCount ( ) const
00189   {
00190     return count_m;
00191   }
00192 
00193 
00194   //--------------------------------------------------- getLength --------------
00199   uint8_t getLength ( ) const
00200   {
00201     return length_m;
00202   }
00203 
00204 
00205   //--------------------------------------------------- getNCode ---------------
00206   virtual NCode_t getNCode ( ) const
00207   {
00208     return Kmer_t::NCODE;
00209   }
00210 
00211 
00212   //--------------------------------------------------- getReads ---------------
00217   const std::vector<ID_t> & getReads ( ) const
00218   {
00219     return reads_m;
00220   }
00221 
00222 
00223   //--------------------------------------------------- getReads ---------------
00228   std::vector<ID_t> & getReads ( )
00229   {
00230     return reads_m;
00231   }
00232 
00233 
00234   //--------------------------------------------------- getSeqString -----------
00241   std::string getSeqString ( ) const;
00242 
00243 
00244   //--------------------------------------------------- readMessage ------------
00245   virtual void readMessage (const Message_t & msg);
00246 
00247 
00248   //--------------------------------------------------- setBase ----------------
00261   //  -- developers note --
00262   //  If we imagine consecutive bytes stored left-to-right, then we can index
00263   //  the seqchars left-to-right as follows:
00264   //   [0] [1] [2] [3]   [4] [5] [6] [7]  ... seqchars
00265   //  [7 6 5 4 3 2 1 0] [7 6 5 4 3 2 1 0] ... bits
00266   //  [byte 0         ] [byte 1         ] ... bytes
00267   //  Thus, to set seqchar 2, we need overwrite the bits at the 2^3 and 2^2
00268   //  positions of byte 0. Since compress returns the sequence bits in the
00269   //  high-order end of the byte, we right-shift the return value 4-bits and
00270   //  "OR" it with the stored byte (making sure to clear those two bits first).
00271   //
00272   void setBase (char seqchar, Pos_t index)
00273   {
00274     if ( index < 0 || index >= length_m )
00275       AMOS_THROW_ARGUMENT ("Requested kmer index is out of range");
00276 
00277     int offset = index % 4 * 2;              // the bitmask offset
00278     uint8_t * seqp = seq_m + index / 4;      // the required byte
00279 
00280     //-- Delete the previous two bits, then set the two new bits
00281     *seqp &= ~(SEQ_BITS >> offset);
00282     *seqp |= compress (seqchar) >> offset;
00283   }
00284 
00285 
00286   //--------------------------------------------------- setCount ---------------
00292   void setCount (uint32_t count)
00293   {
00294     count_m = count;
00295   }
00296 
00297 
00298   //--------------------------------------------------- setReads ---------------
00304   void setReads (const std::vector<ID_t> & reads)
00305   {
00306     reads_m = reads;
00307   }
00308 
00309 
00310   //--------------------------------------------------- setSeqString -----------
00324   void setSeqString (const std::string & seq);
00325 
00326 
00327   //--------------------------------------------------- operator++ -------------
00334   Kmer_t & operator++ (int)
00335   {
00336     count_m ++;
00337     return *this;
00338   }
00339 
00340 
00341   //--------------------------------------------------- writeMessage -----------
00342   virtual void writeMessage (Message_t & msg) const;
00343 
00344 
00345   //--------------------------------------------------- operator= --------------
00353   Kmer_t & operator= (const Kmer_t & source);
00354 };
00355 
00356 } // namespace AMOS
00357 
00358 #endif // #ifndef __Kmer_AMOS_HH

Generated on Mon Feb 22 17:36:27 2010 for libAMOS by doxygen 1.4.7