Ray: /ray/src/lib/tl/parsestring.cc Source File

00001 /*
00002  * lib/tl/parsestring.cc
00003  * 
00004  * Implementation of C-style string parsing for TLString::ParseString(). 
00005  * 
00006  * Copyright (c) 2004 by Klaus Sausen <nicolasius § users.sourceforge.net>
00007  *                   and Wolfgang Wieser ] wwieser (a) gmx <*> de [
00008  * 
00009  * This file may be distributed and/or modified under the terms of the 
00010  * GNU General Public License version 2 as published by the Free Software 
00011  * Foundation. (See COPYING.GPL for details.)
00012  * 
00013  * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
00014  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
00015  * 
00016  */
00017 
00018 #include "tlstring.h"
00019 #include <lib/serror.h>
00020 #include <stdio.h>  // FIXME
00021 
00022 TLString TLString::ParseString(const char *instr,ssize_t inlen,SError &error)
00023 {
00024     if(!instr)  return TLString();
00025     if(inlen<0)
00026     {  inlen=strlen(instr);  }
00027     
00028     // Be sure that the string is NUL-terminated. Normally, that is not 
00029     // needed but it is better here because I'm not sure for flex...
00030     Assert(instr[inlen]=='\0');
00031     
00032     // No error yet; make sure success is set. 
00033     error.clear();
00034     
00035     // This will hold the parsed string: 
00036     TLString dest_str;
00037     // The parsed string will in no case be longer than the input; 
00038     // normally it will be slightly shorter, so allocating the length of 
00039     // the input is the best guess we have and much better than to 
00040     // gradually enlarge the string. 
00041     dest_str.d=new SData(inlen+1);   // +1 for terminating '\0'
00042     SData *db=dest_str.d;
00043     
00044     // Now actually parse the string. Luckily Klaus did most of it :)
00045     //------------------------------------------------------------------
00046     
00049     
00050     // d = dest; s=src
00051     const char *s=instr,*send=s+inlen;
00052     char *d=db->str;
00053     int8 cnt=0;
00054     bool quotemode=false;  // true -> treat double quotes specially
00055     if (*s=='"') {
00056         quotemode=true;
00057         s++;
00058     }
00059     while(s<send)
00060     {
00061         if(*s=='\\')
00062         {   // Handle escape sequences. 
00063             s++;
00064             switch(*s++) {
00065             case 'X'://interpret the next four chars as a hexnumber
00066                 cnt+=2;
00067             case 'x'://interpret the next two chars as a hexnumber
00068             {   cnt+=2;
00069                 int  num=0;
00070                 bool valid=false;
00071                 //  (plain "\x" is illegal; need at least one number) 
00072                 do {
00073                     if ((*s<='9')&&(*s>='0')) {
00074                         num=(num<<4)|(*s-'0');
00075                         valid=true;
00076                     } else
00077                     if ((*s<='F')&&(*s>='A')) {
00078                         num=(num<<4)|(*s-'A'+0xa);
00079                         valid=true;
00080                     } else
00081                     if ((*s<='f')&&(*s>='a')) {
00082                         num=(num<<4)|(*s-'a'+0xa);
00083                         valid=true;
00084                     } else {
00085                         cnt=0;
00086                         if (!valid) { //unparsable char
00087                             error=SError("invalid \\x formatting",1);
00088                         } else  //emit byte character   
00089                             *d++=num;
00090                         if (s>=send) { //reached EOS
00091                             goto end_of_string;
00092                         }
00093                         break;
00094                     }
00095                     if (cnt&1) //emit byte
00096                         *d++=num;
00097                     s++;
00098                 } while (--cnt);
00099             }   break;
00100             //this also handles "\\0"
00101             case '0'://interpret the next three chars as an octal num
00102             {   cnt+=3;
00103     interpret_only_two_chars:
00104                 int num=0;
00105                 do {    //this does *not* allow \\07 but \\007. ?? It works (WW)
00106                     if ((*s<='7')&&(*s>='0')) {
00107                         num<<=3;
00108                         num|=(*s++)-'0';
00109                     } else //escape the escape
00110                         cnt=1;
00111                 } while (--cnt);
00112                 // Not needed due to while(s<send) loop: (WW)
00113                 // ...and would be a bug anyways for strings like "a\0377"...
00114                 //if (s>=send)  goto end_of_string;
00115                 if (num>>8) {  // <-- ">=256"
00116                     cnt=2;
00117                     s-=3;
00118                 goto interpret_only_two_chars;  
00119                 }
00120                 *d++=num&0xff;
00121             }   break;
00122             case 't': *d++='\t'; break;
00123             case 'n': *d++='\n'; break;
00124             case 'r': *d++='\r'; break;
00125             case 'f': *d++='\f'; break;
00126             case 'v': *d++='\v'; break;
00127             case 'a': *d++='\a'; break;
00128             case 'b': *d++='\b'; break;
00129             case '^'://control sequence
00130                 if ((*s<='Z')&&(*s>='A'))
00131                     *d++=*s-'A'+1;
00132                 else
00133                     error=SError("warning: unknown ctrl char",1);
00134                 s++;
00135                 break;
00136             case '"': // escaped quote outside quotemode.. catch it
00137             case '\'':// escaped single quote
00138             case '\\'://simply copy those
00139                 *d++=*(s-1);
00140                 break;
00141             default:
00142                 error=SError("warning: unknown escape seq",1);
00143                 break;
00144             }
00145         }
00146         else    //scan for next quote and ignore any white space sequences:
00147         if ((quotemode)&&(*s=='"')) {
00148             bool fault=false;
00149             do {
00150                 s++;
00151                 switch(*s) {
00152                 case  ' '://these whitespaces are ignored
00153                 case '\r':
00154                 case '\n':
00155                 case '\t':
00156                 case '\v':
00157                 case '\f':
00158                 case '\0': break;  // ignore also NULs
00159                 case  '"':
00160                     ++s;  // Skip the double quote. 
00161                     goto quotation_starts_again;
00162                 case '\\':  if(s[1]=='\n') break;
00163                 default://anything else will result in an error
00164                     fault=true;  // We ignore it for now and scan to the 
00165                     break;       // opening quote first. 
00166                 }
00167             } while (s<send);
00168 quotation_starts_again:;
00169             if (fault) {
00170                 error=SError("non-whitespace between string fragments",1);
00171             }
00172             // Not needed as we will run through while(s<send) immediately (WW)
00173             //if (s>=send)  goto end_of_string;
00174         } else {// The rest gets just copied. 
00175             *d++=*s++;
00176         }
00177     }
00178 end_of_string:
00179     // Make sure that d->str[d->len]='\0'. The string must be '\0'-terminated 
00180     // even if there are more NULs embedded. 
00181     *d='\0';  // terminate
00182     db->len=d-db->str;
00183     
00184     //------------------------------------------------------------------
00185     // Done parsing. 
00186     
00187     Assert(db->len<db->asize);  // NOT <= for terminating '\0'
00188     Assert(db->str[db->len]=='\0');
00189     
00190     // If the parsed string is significantly smaller than the input, 
00191     // resize the data to free excess memory: 
00192     db->DownSizeIfNeeded(db->len+1);
00193     
00194     return(dest_str);
00195 }
00196 
00197 
00198 #if 0  /* Little test program */
00199 
00200 #include <unistd.h>
00201 #include <stdio.h>   /* test program */
00202 
00203 // now try this: 
00204 // bash# make
00205 // bash# gcc tlstring.o -o test lib_tl.a ../lib_misc.a ../threads/lib_threads.a -lpthread
00206 // bash# ./test "\\X5241\\x59.\\X4C41\\X554eCH();\\^M\\^J\\tandsuch"
00207 static void DoParseString(const char *instr)
00208 {
00209     SError error;
00210     TLString str=TLString::ParseString(instr,-1,error);
00211     
00212     fprintf(stderr,"Parsed string has length %u.\n",str.length());
00213     
00214     // This is semi-clean but OK for a test program. 
00215     // Using fprintf(stderr,"str: >%s<\n",str); will not play for embedded 
00216     // NUL characters, hence use system write(). 
00217     write(2,"str=>",5);
00218     write(2,str.str(),str.length());
00219     write(2,"<\n",2);
00220     for (uint a=0;a<str.length();a++)
00221         printf("%3x",(unsigned char)str.str()[a]);
00222     printf("\n");   
00223     if(error)
00224     {  fprintf(stderr,"Error occured: %s\n",error.msg().str());  }
00225 }
00226 
00227 
00228 int main(int argc,char **arg)
00229 {
00230     char *s="\\X5241\\x59.\\X4C41\\X554eCH();\\^M\\^J\\tand_\\0_such";
00231     char*s2="\"this is the line wrapping-(\" \t \t\r\t\n \t\")-test with \\x52\\X4159\"";
00232     s=s2;
00233     if(argc==1)
00234     {
00235         printf("will parse string %s\n",s);
00236         DoParseString(s);
00237     }
00238     bool interactive=0;
00239     for(int i=1; i<argc; i++)
00240     {
00241         DoParseString(arg[i]);
00242         if(arg[i][0]=='-' && !arg[i][1])  interactive=1;
00243     }
00244     if(interactive)
00245     {
00246         const size_t buflen=256;
00247         char buf[buflen];
00248         while(!feof(stdin) && !ferror(stdin))
00249         {
00250             if(!fgets(buf,256,stdin)) continue;
00251             size_t len=strlen(buf);
00252             if(len && buf[len-1]=='\n')   buf[--len]='\0';
00253             DoParseString(buf);
00254         }
00255     }
00256     
00257     return(0);
00258 }
00259 
00260 #endif