Main Page   Class Hierarchy   Alphabetical List   Compound List   Examples  

itparser.h

00001 #ifndef _MIMETIC_PARSER_ITPARSER_H_
00002 #define _MIMETIC_PARSER_ITPARSER_H_
00003 #include <iterator>
00004 #include <algorithm>
00005 #include <stack>
00006 #include <iostream>
00007 #include <mimetic/tree.h>
00008 #include <mimetic/utils.h>
00009 #include <mimetic/mimeentity.h>
00010 
00011 
00012 // FIXME: handle HigherLevelClosingBoundary
00013 
00014 namespace mimetic
00015 {
00016 
00017 /// Parse the input reading from an iterator
00018 template<typename Iterator, 
00019 typename ItCategory=typename std::iterator_traits<Iterator>::iterator_category> 
00020 struct IteratorParser
00021 {
00022 };
00023 
00024 /*
00025  * Input Iterator
00026  */
00027 template<typename Iterator>
00028 struct IteratorParser<Iterator, std::input_iterator_tag>
00029 {
00030 
00031     IteratorParser(MimeEntity& me)
00032     : m_me(me), m_iMask(imNone), m_lastBoundary(NoBoundary)
00033     {
00034         m_entityStack.push(&m_me);
00035     }
00036     virtual ~IteratorParser()
00037     {
00038     }
00039     /**
00040      * set the Ignore Mask to \p mask
00041      */
00042     void iMask(size_t mask)    {    m_iMask = mask;        }
00043     /**
00044      * get the Ignore Mask 
00045      */
00046     size_t iMask() const    {    return m_iMask;        }
00047     /**
00048      * start parsing
00049      */
00050     void run(Iterator bit, Iterator eit)
00051     {
00052         m_bit = bit;
00053         m_eit = eit;
00054         doLoad();
00055     }
00056 protected:
00057     typedef std::list<std::string> BoundaryList;
00058     enum { 
00059         CR = 0xD, 
00060         LF = 0xA, 
00061         NL = '\n' 
00062     };
00063     enum /* ParsingElem */ { 
00064         peIgnore, 
00065         pePreamble, 
00066         peBody, 
00067         peEpilogue 
00068     };
00069     enum BoundaryType {
00070         NoBoundary = 0,
00071         Boundary,
00072         ClosingBoundary,
00073         HigherLevelBoundary
00074         //, HigherLevelClosingBoundary
00075     };
00076     enum EntityType { 
00077         etRfc822, 
00078         etMsgRfc822, 
00079         etMultipart 
00080     };
00081     // vars
00082     MimeEntity& m_me;
00083     Iterator m_bit, m_eit;
00084     size_t m_iMask; // ignore mask
00085     BoundaryList m_boundaryList;
00086     BoundaryType m_lastBoundary;
00087     std::stack<MimeEntity*> m_entityStack;
00088 
00089 protected:
00090     void appendPreambleBlock(const char* buf, int sz)
00091     {
00092         MimeEntity* pMe = m_entityStack.top();
00093         pMe->body().preamble().append(buf,sz);
00094     }
00095     
00096     void appendEpilogueBlock(const char* buf, int sz)
00097     {
00098         MimeEntity* pMe = m_entityStack.top();
00099         pMe->body().epilogue().append(buf,sz);
00100     }
00101     
00102     void appendBodyBlock(const char* buf, int sz)
00103     {
00104         MimeEntity* pMe = m_entityStack.top();
00105         pMe->body().append(buf, sz);
00106     }
00107     
00108     std::string getBoundary()
00109     {
00110         const MimeEntity* pMe = m_entityStack.top();
00111         const ContentType& ct = pMe->header().contentType();
00112         return std::string("--") + ct.param("boundary");
00113     }
00114     
00115     void popChild()
00116     {
00117         m_entityStack.pop();
00118     }
00119     
00120     void pushNewChild()
00121     {
00122         MimeEntity* pMe = m_entityStack.top();
00123         MimeEntity* pChild = new MimeEntity;
00124         pMe->body().parts().push_back(pChild);
00125         m_entityStack.push(pChild);
00126     }
00127     
00128     EntityType getType()
00129     {
00130         MimeEntity* pMe = m_entityStack.top();
00131         const Header& h = pMe->header();
00132         // will NOT be automatically created if it doesn't exists;
00133         // null ContentType will be returned
00134         const ContentType& ct = h.contentType();
00135         if(ct.isMultipart())
00136             return etMultipart;
00137         else if    (ct.type() == "message" && ct.subtype() == "rfc822") 
00138             return etMsgRfc822;
00139         else
00140             return etRfc822;
00141     }
00142     
00143     void addField(const std::string& name, const std::string& value)
00144     {
00145         MimeEntity* pMe = m_entityStack.top();
00146         Header& h = pMe->header();
00147         Header::iterator it = h.insert(h.end(), Field());
00148         it->name(name);
00149         it->value(value);
00150     }
00151 
00152     BoundaryType isBoundary(const std::string& line) 
00153     {
00154         if(line.length() == 0 || line[0] != '-')
00155             return m_lastBoundary = NoBoundary;
00156 
00157         int level = 0; // multipart nesting level
00158         int lineLen = line.length();
00159         BoundaryList::const_iterator bit,eit;
00160         bit = m_boundaryList.begin(), eit = m_boundaryList.end();
00161         for(;bit != eit; ++bit, ++level)
00162         {
00163             const std::string& b = *bit;
00164             int bLen = b.length();
00165             if(line.compare(0, bLen, b) == 0)
00166             { 
00167                 // not the expected boundary, malformed msg
00168                 if(level > 0)
00169                     return m_lastBoundary=HigherLevelBoundary;
00170                 // plain boundary or closing boundary?
00171                 if(lineLen > bLen && line.compare(bLen,2,"--") == 0)
00172                     return m_lastBoundary = ClosingBoundary;
00173                 else
00174                     return m_lastBoundary = Boundary;
00175             }
00176         }
00177         return m_lastBoundary = NoBoundary;
00178     }
00179     // is new line
00180     inline bool isnl(char c) const
00181     {
00182         return (c == CR || c == LF);
00183     }
00184     // is a two char newline
00185     inline bool isnl(char a, char b) const
00186     {
00187         if(a == CR || a == LF)
00188             if(b == (a == CR ? LF : CR))
00189                 return true;
00190         return false;
00191     }
00192     void doLoad()
00193     {
00194         loadHeader();
00195         loadBody();
00196     }
00197     bool valid() const
00198     {
00199         return m_bit != m_eit;
00200     }
00201     void append(char*& buf, size_t& bufsz, char c, size_t& pos)
00202     {
00203         enum { alloc_block = 128};
00204         if(pos == bufsz) 
00205         {
00206             // allocate and init buffer
00207             char* tmp = buf;
00208             int oldBufsz = bufsz;
00209             while(pos >= bufsz)
00210                 bufsz = bufsz + alloc_block;
00211             buf = new char[bufsz+1];    
00212             if(tmp != 0)
00213             {
00214                 assert(oldBufsz > 0);
00215                 memset(buf, 0, bufsz);
00216                 memcpy(buf, tmp, oldBufsz);
00217                 delete[] tmp;
00218             }
00219         }
00220         buf[pos++] = c;
00221     }
00222     // parses the header and calls addField and pushChild
00223     // to add fields and nested entities
00224     void loadHeader()
00225     {
00226         enum { 
00227             sInit,
00228             sIgnoreLine,
00229             sNewline,
00230             sWaitingName, 
00231             sWaitingValue, 
00232             sWaitingFoldedValue,
00233             sName, 
00234             sValue,
00235             sIgnoreHeader
00236         };
00237         register int status;
00238         int pos;
00239         char *name, *value;
00240         size_t nBufSz, vBufSz, nPos, vPos;
00241         char prev, c = 0;
00242 
00243         name = value = 0;
00244         pos = nBufSz = vBufSz = nPos = vPos = 0;
00245         status = (m_iMask & imHeader ? sIgnoreHeader : sInit);
00246         //status = sInit;
00247         while(m_bit != m_eit)
00248         {
00249             c = *m_bit;
00250             switch(status)
00251             {
00252             case sInit:
00253                 if(isnl(c))
00254                     status = sNewline;
00255                 else
00256                     status = sName;
00257                 continue;
00258             case sIgnoreLine:
00259                 if(!isnl(c))
00260                     break;
00261                 status = sNewline;
00262                 continue;
00263             case sNewline:
00264                 status = sWaitingName;
00265                 if(pos > 0)
00266                 {
00267                     pos = 0;
00268                     prev = c;
00269                     if(++m_bit == m_eit) goto out; //eof
00270                     c = *m_bit;
00271                     if(c == (prev == CR ? LF : CR))
00272                     {
00273                         --pos;
00274                         break;
00275                     } else 
00276                         continue;
00277                 } else {
00278                     // empty line, end of header
00279                     prev = c;
00280                     if(++m_bit == m_eit) goto out; //eof
00281                     c = *m_bit;
00282                     if(c == (prev == CR ? LF : CR))
00283                         ++m_bit;    
00284                     goto out;
00285                 }
00286             case sWaitingName:
00287                 if(isblank(c))
00288                 {
00289                     // folded value
00290                     status = sWaitingFoldedValue;
00291                     continue;
00292                 } 
00293                 // not blank, new field or empty line 
00294                 if(nPos)
00295                 {
00296                     name[nPos] = 0;
00297                     // is not an empty field (name: \n)
00298                     if(vPos) 
00299                     {
00300                         value[vPos] = 0;
00301                         addField(name,value);
00302                     } else
00303                         addField(name,"");
00304                     nPos = vPos = 0;
00305                 }
00306                 status = (isnl(c) ? sNewline : sName);
00307                 continue;
00308             case sWaitingValue:
00309                 if(isblank(c) || isnl(c))
00310                     break; // eat leading blanks
00311                 status = sValue;
00312                 continue;
00313             case sWaitingFoldedValue:
00314                 if(isblank(c))
00315                     break; // eat leading blanks
00316                 append(value, vBufSz, ' ', vPos);
00317                 status = sValue;
00318                 continue;
00319             case sName:
00320                 if(c > 32 && c < 127 && c != ':') {
00321                     append(name, nBufSz, c, nPos);
00322                 } else if(c == ':') {
00323                     status = sWaitingValue;
00324                 } else {
00325                     nPos = 0;
00326                     status = sIgnoreLine;
00327                     continue;
00328                 }
00329                 break;
00330             case sValue:
00331                 if(isnl(c))
00332                 {
00333                     status = sNewline;
00334                     continue;
00335                 }
00336                 append(value, vBufSz, c, vPos);
00337                 break;
00338             case sIgnoreHeader:
00339                 if(isnl(c))
00340                 {
00341                     prev = c;
00342                     if(++m_bit == m_eit) goto out; //eof
00343                     c = *m_bit;
00344                     if(c == (prev == CR ? LF : CR))
00345                         ++m_bit;    
00346                     if(pos == 0)    
00347                         goto out; //empty line, eoh
00348                     pos = 0;
00349                     continue;
00350                 } 
00351                 break;
00352             }
00353             ++m_bit; ++pos;
00354         }
00355     out:
00356         if(name)
00357             delete[] name;
00358         if(value)
00359             delete[] value;
00360         return;
00361     }
00362     void loadBody()
00363     {
00364         switch(getType())
00365         {
00366         case etRfc822:
00367             if(m_iMask & imBody)
00368                 jump_to_next_boundary();
00369             else
00370                 copy_until_boundary(peBody);
00371             break;
00372         case etMultipart:
00373             loadMultipart();
00374             break;
00375         case etMsgRfc822:
00376             if(m_iMask & imChildParts)
00377                 jump_to_next_boundary();
00378             else {
00379                 pushNewChild();
00380                 doLoad(); // load child entities
00381                 popChild();
00382             }
00383             break;
00384         }
00385     }
00386     void loadMultipart()
00387     {
00388         std::string boundary = getBoundary();
00389         m_boundaryList.push_front(boundary);
00390         ParsingElem pe;
00391         // preamble
00392         pe = (m_iMask & imPreamble ? peIgnore : pePreamble );
00393         copy_until_boundary(pe);
00394         while(m_bit != m_eit)
00395         {
00396             switch(m_lastBoundary)
00397             {
00398             case NoBoundary:
00399                 return; // eof
00400             case Boundary:
00401                 if(m_iMask & imChildParts)
00402                     jump_to_next_boundary();
00403                 else {
00404                     pushNewChild();
00405                     doLoad();
00406                     popChild();
00407                 }
00408                 break;
00409             case ClosingBoundary:
00410                 m_boundaryList.erase(m_boundaryList.begin());
00411                 // epilogue
00412                 pe=(m_iMask & imEpilogue? peIgnore: peEpilogue);
00413                 copy_until_boundary(pe);
00414                 return;
00415             case HigherLevelBoundary:
00416                 m_boundaryList.erase(m_boundaryList.begin());
00417                 return;
00418             }
00419         }
00420     }
00421     inline void onBlock(const char* block, int sz, ParsingElem pe)
00422     {
00423         switch(pe)
00424         {
00425         case peIgnore:
00426             return;
00427         case pePreamble:
00428             appendPreambleBlock(block, sz);
00429             break;
00430         case peEpilogue:
00431             appendEpilogueBlock(block, sz);
00432             break;
00433         case peBody:
00434             appendBodyBlock(block, sz);
00435             break;
00436         }
00437     }
00438     void jump_to_next_boundary()
00439     {
00440         copy_until_boundary(peIgnore);
00441     }
00442     // this is where most of execution time is spent when parsing
00443     // large messages; I'm using a plain char[] buffer instead of
00444     // std::string because I want to be as fast as possible here
00445     virtual void copy_until_boundary(ParsingElem pe)
00446     {
00447         size_t pos, lines, eomsz = 0;
00448         register char c;
00449         enum { nlsz = 1 };
00450         const char *eom = 0;
00451 
00452         enum { blksz = 4096 };
00453         char block[blksz];
00454         size_t blkpos = 0;
00455         size_t sl_off = 0; // start of line offset into *block
00456 
00457         pos = lines = 0;
00458         while(m_bit != m_eit)
00459         {
00460             // if buffer is full
00461             if(blkpos >= blksz - 2 - nlsz)
00462             {
00463                 if(sl_off == 0)
00464                 { 
00465                     // very long line found, assume it 
00466                     // can't be a boundary and flush the buf
00467                     // with the partial line
00468                     block[blkpos] = 0;
00469                     onBlock(block, blkpos, pe);
00470                     blkpos = sl_off = 0;
00471                 } else {
00472                     // flush the buffer except the last
00473                     // (probably incomplete) line
00474                     size_t llen = blkpos - sl_off;
00475                     onBlock(block, sl_off, pe);
00476                     memmove(block, block + sl_off, llen);
00477                     sl_off = 0;
00478                     blkpos = llen;
00479                 }
00480             }
00481             c = *m_bit;
00482             if(isnl(c))
00483             {
00484                 char nlbuf[3] = { 0, 0, 0 };
00485 
00486                 nlbuf[0] = c; // save the current NL char in nlbuf
00487 
00488                 // save the second char of the NL sequence (if any) in nlbuf
00489                 if(++m_bit != m_eit) 
00490                 {
00491                     char next = *m_bit;
00492                     if(next == (c == CR ? LF : CR))
00493                     {
00494                         nlbuf[1] = next; // save the next char in the NL seq
00495                         ++m_bit;
00496                     }
00497                 }
00498 
00499                 if(pos)
00500                 {
00501                     // not an empty row, is this a boundary?
00502                     block[blkpos] = 0;
00503                     if(block[sl_off] == '-' && sl_off < blkpos &&
00504                          block[sl_off+1] == '-')
00505                     {
00506                         std::string Line(block+sl_off, blkpos-sl_off);
00507                         if(isBoundary(Line))
00508                         {
00509                             // trim last newline
00510                             if (sl_off>=2) 
00511                             {
00512                                 int i = sl_off;
00513                                 char a = block[--i];
00514                                 char b = block[--i];
00515 
00516                                 if(isnl(a,b))
00517                                     sl_off -= 2;
00518                                 else if(isnl(a))
00519                                     sl_off--;
00520 
00521                             } else if (sl_off==1 && isnl(block[0])) {
00522                                 sl_off--;
00523                             }
00524                             onBlock(block, sl_off, pe);
00525                             return;
00526                         }
00527                     }
00528                     // exit if this is the end of message 
00529                     // marker
00530                     if(eom && pos >= eomsz)
00531                     {
00532                         char *line = block + sl_off;
00533                         size_t i = 0;
00534                         for(; i < eomsz; i++)
00535                             if(eom[i] != line[i])
00536                                 break;
00537                         if(i==eomsz) // if eom found
00538                         {
00539                             onBlock(block, sl_off,
00540                                 pe);
00541                             return; 
00542                         }
00543                     }
00544                 }
00545                 // append the saved NL sequence
00546                 for(int i = 0; nlbuf[i] != 0; i++)
00547                     block[blkpos++] = nlbuf[i];
00548                 block[blkpos] = 0;
00549                 sl_off = blkpos;
00550                 pos = 0;
00551             } else {
00552                 pos++; // line pos
00553                 block[blkpos++] = c;
00554                 ++m_bit; 
00555             }
00556         }
00557         // eof
00558         block[blkpos] = 0;
00559         onBlock(block, blkpos, pe);
00560     }
00561 };
00562 
00563 
00564 /*
00565  * Forward Iterator
00566  */
00567 template<typename Iterator>
00568 struct IteratorParser<Iterator, std::forward_iterator_tag>: 
00569     public IteratorParser<Iterator, std::input_iterator_tag>
00570 {
00571     /* input_iterator ops
00572      * *it = xxx
00573      * X& op++
00574      * X& op++(int)
00575      */
00576     typedef IteratorParser<Iterator, std::input_iterator_tag> base_type;
00577     IteratorParser(MimeEntity& me)
00578     : base_type(me)
00579     {
00580     }
00581 };
00582 
00583 /*
00584  * Bidirectional Iterator
00585  */
00586 template<typename Iterator>
00587 struct IteratorParser<Iterator, std::bidirectional_iterator_tag>:
00588     public IteratorParser<Iterator, std::forward_iterator_tag>
00589 {
00590     typedef IteratorParser<Iterator, std::forward_iterator_tag> base_type;
00591     IteratorParser(MimeEntity& me)
00592     : base_type(me)
00593     {
00594     }
00595 };
00596 
00597 /*
00598  * Random Access Iterator
00599  */
00600 template<typename Iterator>
00601 struct IteratorParser<Iterator, std::random_access_iterator_tag>:
00602     public IteratorParser<Iterator, std::bidirectional_iterator_tag>
00603 {
00604     typedef IteratorParser<Iterator, std::bidirectional_iterator_tag> base_type;
00605     IteratorParser(MimeEntity& me)
00606     : base_type(me)
00607     {
00608     }
00609 private:
00610     using base_type::peIgnore;
00611     using base_type::pePreamble;
00612     using base_type::peBody;
00613     using base_type::peEpilogue;
00614     
00615     using base_type::NoBoundary;
00616     using base_type::Boundary;
00617     using base_type::ClosingBoundary;
00618     using base_type::HigherLevelBoundary;
00619     
00620     using base_type::m_boundaryList;
00621     using base_type::m_lastBoundary;
00622     using base_type::m_entityStack;
00623     using base_type::m_me;
00624     using base_type::m_iMask;
00625     using base_type::m_bit;
00626     using base_type::m_eit;
00627     using base_type::isnl;
00628     
00629     typedef TreeNode<char> BoundaryTree;
00630     inline void onBlock(Iterator bit, int size, ParsingElem pe)
00631     {
00632         if(pe == peIgnore)
00633             return;
00634         Iterator eit = bit + size;
00635         MimeEntity* pMe = m_entityStack.top();
00636         switch(pe)
00637         {
00638         case pePreamble:
00639             pMe->body().preamble().append(bit, eit);
00640             break;
00641         case peEpilogue:
00642             pMe->body().epilogue().append(bit, eit);
00643             break;
00644         case peBody:
00645             pMe->body().append(bit, eit);
00646             break;
00647         }
00648     }
00649     void copy_until_boundary(ParsingElem pe)
00650     {
00651         // if we don't have any boundary copy until m_eit and return
00652         if(m_boundaryList.empty())
00653         {
00654             onBlock(m_bit, m_eit-m_bit, pe);
00655             m_bit = m_eit;
00656             return;
00657         }
00658         // search for current boundary; if not found (i.e. malformed
00659         // message) repeat the search for higher level boundary
00660         // (slow just for malformed msg, very fast otherwise)
00661         typename base_type::BoundaryList::const_iterator 
00662             bBit = m_boundaryList.begin(), bEit = m_boundaryList.end();
00663         m_lastBoundary = NoBoundary;
00664         int depth = 0;
00665         for( ;bBit != bEit; ++bBit, ++depth)
00666         {
00667             const std::string& boundary = *bBit;
00668             Iterator off;
00669             if( (off=utils::find_bm(m_bit,m_eit,boundary)) != m_eit)
00670             {
00671                 Iterator base = m_bit;
00672                 size_t block_sz = off - base;
00673                 m_lastBoundary = 
00674                     (depth ? HigherLevelBoundary: Boundary);
00675                 off += boundary.length();
00676                 m_bit = off;
00677                 if(off<m_eit-1 && *off =='-' && *(off+1) == '-')
00678                 {
00679                     m_lastBoundary = ClosingBoundary;
00680                     m_bit = off + 2;
00681                 }
00682                 if(m_bit < m_eit-1 && isnl(*m_bit)) 
00683                 {
00684                     char c = *m_bit++;
00685                     char next = *m_bit;
00686                     if(isnl(next) && next != c)
00687                         ++m_bit;
00688                 }
00689 
00690                 // trim last newline
00691                 if(block_sz)
00692                 {
00693                     Iterator p = base + block_sz;
00694                     char a = *--p, b = *--p;
00695                     if(isnl(a,b))
00696                         block_sz -= 2;
00697                     else if(isnl(a))
00698                         block_sz--;
00699                 }
00700                 onBlock(base, block_sz, pe);
00701                 return;
00702             } else {
00703                 onBlock(m_bit, m_eit-m_bit, pe);
00704                 m_bit = m_eit;
00705             }
00706         }
00707     }
00708     BoundaryTree m_boundaryTree;
00709     void buildBoundaryTree()
00710     {
00711         m_boundaryTree = BoundaryTree(); // clear
00712         typename base_type::BoundaryList::const_iterator 
00713             bit = m_boundaryList.begin(), eit = m_boundaryList.end();
00714         BoundaryTree::NodeList *pChilds;
00715         BoundaryTree::NodeList::iterator it;
00716         int depth = 0;
00717         for( ; bit != eit; ++bit)
00718         {
00719             pChilds = &m_boundaryTree.childList();
00720             it = pChilds->begin();
00721             const char *w = bit->c_str();
00722             do
00723             {
00724                 it = find_if(pChilds->begin(), pChilds->end(), 
00725                         FindNodePred<char>(*w));
00726                 if( it == pChilds->end() )
00727                     it = pChilds->insert(pChilds->end(),*w);
00728                 pChilds = &it->childList();
00729                 depth++;
00730             } while(*(++w));
00731         }
00732     }
00733 
00734 };
00735 
00736 }
00737 
00738 #endif