花之剑'HOME

一朵飘舞在风中的雪花,挣扎着,不想被融化。

解析html文件(原创)

Posted on 2008-04-20 11:00 花之剑 阅读(549) 评论(0)  编辑  收藏

一个很简单的 HTML 文件内容解析程序:过滤 script 和 css,
并把中英文隔开,以便切词使用。速度没有测试过,
应该不是很快,有待优化。

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef _WIN32
#    include <conio.h>
#    include <tchar.h>
#    include <winsock2.h>
#    include <errno.h>
#    include <assert.h>
#elif defined(_LINUX) || defined(__LINUX)  // linux
#    include <iconv.h>
#    include <errno.h>
#    include <signal.h>
#    include <execinfo.h>
#    include <termios.h>
#    include <unistd.h>
#    include <assert.h>
#    include <string.h>
#    include <sys/ioctl.h>
#    include <sys/time.h>
#    include <sys/types.h>
#    include <dlfcn.h>
#    include <sys/select.h>
#endif
/* Path of the HTML file that main() reads (relative to the working directory). */
const char *g_html = "source.html";
/* Path of the text file that main() writes the extracted text to. */
const char *g_dest = "dest.txt";
/*
 * Return non-zero when the string at `s` begins with `prefix`, comparing
 * ASCII letters case-insensitively. The comparison stops at the first
 * mismatch, so it never reads past the terminating NUL of `s`.
 */
static int html_starts_with(const char *s, const char *prefix)
{
    while (*prefix != '\0') {
        if (tolower((unsigned char)*s) != tolower((unsigned char)*prefix)) {
            return 0;
        }
        s++;
        prefix++;
    }
    return 1;
}

/*
 * Advance `p` until it points at `marker` (case-insensitive) or at the
 * terminating NUL, whichever comes first, and return the new position.
 */
static char *html_skip_to(char *p, const char *marker)
{
    while (*p != '\0' && !html_starts_with(p, marker)) {
        p++;
    }
    return p;
}

/*
 * Strip markup from a NUL-terminated HTML string.
 *
 * Everything between '<' and '>' is dropped, and the whole content of
 * <style> and <script> elements is dropped as well (tag names are matched
 * case-insensitively). Bytes that follow a '>' are copied verbatim.
 * Text that appears before the first '>' of the input is not copied.
 *
 * inbuffer  - NUL-terminated HTML text; not modified.
 * outbuffer - receives the NUL-terminated result; must have room for at
 *             least strlen(inbuffer) + 1 bytes. It is always terminated,
 *             even when no text is produced.
 */
void HtmlToText(char *inbuffer, char *outbuffer)
{
    int in_text = 0;

    if (outbuffer == NULL) {
        return;
    }
    *outbuffer = '\0';
    if (inbuffer == NULL) {
        return;
    }

    while (*inbuffer != '\0') {
        /*
         * Skip the body of <style>/<script>. We stop AT the closing tag (or
         * at end of input when it is missing); the generic tag handling
         * below then consumes the closing tag itself, including variants
         * such as "</script >".
         */
        if (html_starts_with(inbuffer, "<style")) {
            inbuffer = html_skip_to(inbuffer, "</style");
            continue;
        }
        if (html_starts_with(inbuffer, "<script")) {
            inbuffer = html_skip_to(inbuffer, "</script");
            continue;
        }

        if (*inbuffer == '<') {
            in_text = 0;
        } else if (*inbuffer == '>') {
            in_text = 1;
            inbuffer++;
            continue;
        }

        if (in_text) {
            *outbuffer++ = *inbuffer;
            *outbuffer = '\0';
        }
        inbuffer++;
    }
}

 
/*
 * Tidy a string for word segmentation (first-pass handling of punctuation
 * and mixed ASCII / multi-byte text).
 *
 * Bytes are classified into three groups:
 *   - 1..64   (whitespace, digits, punctuation, '@'): treated as separators;
 *             a run of them collapses into one space, and a run at the very
 *             start of the string produces nothing.
 *   - 65..127 (ASCII letters and the remaining ASCII symbols): copied.
 *   - >= 0x80 (bytes of a multi-byte encoding, e.g. Chinese text): copied.
 * A single space is also inserted wherever the text switches between the
 * 65..127 group and the >= 0x80 group.
 *
 * str           - NUL-terminated input; not modified.
 * reviseContent - receives the NUL-terminated result. Any previous content
 *                 is overwritten. Must have room for 2 * strlen(str) + 1
 *                 bytes (worst case: every byte is preceded by a space).
 */
void ReviseString(char *str, char *reviseContent)
{
    enum { PREV_BLANK = 0, PREV_ASCII = 1, PREV_MULTIBYTE = 2 };
    const char split_char = ' ';   /* separator written between tokens */
    const unsigned char *in;
    char *out = reviseContent;
    int prev = PREV_BLANK;         /* class of the last byte written */

    if (reviseContent == NULL) {
        return;
    }
    *out = '\0';
    if (str == NULL) {
        return;
    }

    /* Read bytes as unsigned so the >= 0x80 test does not depend on whether
     * plain char is signed on this platform. */
    for (in = (const unsigned char *)str; *in != '\0'; in++) {
        unsigned char c = *in;

        if (c >= 0x80) {
            if (prev == PREV_ASCII) {
                *out++ = split_char;
            }
            *out++ = (char)c;
            prev = PREV_MULTIBYTE;
        } else if (c > 64) {
            if (prev == PREV_MULTIBYTE) {
                *out++ = split_char;
            }
            *out++ = (char)c;
            prev = PREV_ASCII;
        } else if (prev != PREV_BLANK) {
            /* First separator after a token: emit exactly one space. */
            *out++ = split_char;
            prev = PREV_BLANK;
        }
        /* else: separator following a separator (or at the start) - drop. */
    }
    *out = '\0';
}

/*
 * Read the whole file named by g_html, strip its markup with HtmlToText(),
 * tidy the result with ReviseString(), and append it to the file named by
 * g_dest. On success the program waits for one character on stdin before
 * exiting with 0; on any failure it prints a message and exits with -1.
 */
int main(void)
{
    int rc = -1;
    FILE *fp = NULL;
    FILE *fw = NULL;
    char *buff = NULL;        /* raw file content */
    char *outbuffer = NULL;   /* text with markup removed */
    char *textBuffer = NULL;  /* segmented text */
    long len = 0;
    size_t size = 0;
    size_t nread = 0;

    /* Binary mode so that the size reported by ftell() matches what fread()
     * delivers (text mode on Windows translates line endings). */
    if ((fp = fopen(g_html, "rb")) == NULL) {
        puts("open source file errod");
        goto cleanup;
    }

    if ((fw = fopen(g_dest, "a")) == NULL) {
        puts("open desr file errod");
        goto cleanup;
    }

    if (fseek(fp, 0, SEEK_END) != 0 || (len = ftell(fp)) < 0 ||
        fseek(fp, 0, SEEK_SET) != 0) {
        puts("read source file error");
        goto cleanup;
    }
    size = (size_t)len;

    /* ReviseString() may emit up to two bytes per input byte, so its buffer
     * is twice as large; guard the multiplication against overflow. */
    if (size > ((size_t)-1 - 1) / 2) {
        puts("source file too large");
        goto cleanup;
    }

    buff = malloc(size + 1);
    outbuffer = malloc(size + 1);
    textBuffer = malloc(2 * size + 1);
    if (buff == NULL || outbuffer == NULL || textBuffer == NULL) {
        puts("out of memory");
        goto cleanup;
    }

    nread = fread(buff, 1, size, fp);
    if (ferror(fp)) {
        puts("read source file error");
        goto cleanup;
    }
    buff[nread] = '\0';
    textBuffer[0] = '\0';

    HtmlToText(buff, outbuffer);
    ReviseString(outbuffer, textBuffer);

    if (fputs(textBuffer, fw) == EOF) {
        puts("write dest file error");
        goto cleanup;
    }
    rc = 0;

cleanup:
    free(textBuffer);
    free(outbuffer);
    free(buff);
    if (fw != NULL && fclose(fw) != 0 && rc == 0) {
        /* fclose flushes buffered output; a failure means data was lost. */
        puts("write dest file error");
        rc = -1;
    }
    if (fp != NULL) {
        fclose(fp);
    }

    if (rc == 0) {
        getchar();   /* keep the console window open until a key is pressed */
    }
    return rc;
}


只有注册用户登录后才能发表评论。


网站导航: