开发者

cannot extract the text alone using htmlcxx

include

#include <iostream>
#include <sstream>
#include <curl/curl.h>
  #include <htmlcxx/html/ParserDom.h>
  #include <iostream>
  using namespace std;
  using namespace htmlcxx;

/**
 * libcurl write callback: appends one received chunk to a std::ostringstream.
 *
 * @param buf    Pointer to the received data (not NUL-terminated).
 * @param size   Size of one data element.
 * @param nmemb  Number of elements in the chunk.
 * @param userp  Pointer to the destination std::ostringstream, as registered
 *               by the caller; may be null.
 * @return The number of BYTES consumed (size * nmemb) on success. libcurl
 *         treats any other value as a write error and aborts the transfer,
 *         so 0 is returned when no destination stream was supplied.
 */
static size_t http_write(void* buf, size_t size, size_t nmemb, void* userp)
{
    if (!userp)
    {
        // No sink to write into: report "nothing consumed".
        return 0;
    }

    const size_t bytes = size * nmemb;
    std::ostringstream* oss = static_cast<std::ostringstream*>(userp);
    oss->write(static_cast<const char*>(buf), static_cast<std::streamsize>(bytes));

    // The callback contract is in bytes, not in elements; returning nmemb is
    // only correct by accident when size == 1.
    return bytes;
}

string get_html_page(const string& url, long timeout = 0)
{
    CURL* curl = curl_easy_init();

    ostringstream oss;

    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &http_write);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_FILE, &oss);
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

    curl_easy_perform(curl);
    curl_easy_cleanup(curl);

    return oss.str();
}

int main()
{
    string html = get_html_page("http://www.google.co.in");

    //cout << html << endl;
     HTML::ParserDom parser;
      tree<HTML::Node> dom = parser.parseTree(html);

      //Print whole DOM tree
      //cout <<dom <<endl;

      //Dump all links in the tree
      tree<HTML::Node>::iterator it = dom.begin();
      tree<HTML::Node>::iterator end = dom.end();
      for (; it !=end; ++it)
      {
         if (strcasecmp(it->tagName().c_str(), "A") == 0)
         {
           it->parseAttributes();
           //cout << it->attribute("href").second << endl;
         }
      }

      //Dump all text of the document
      it = dom.begin();
      end = dom.end();
      for (; it != end; ++it)
      {
        if ((!it->isTag()) && (!it->isComment()))
        {
          cout << it->text();
        }
      }
    //  cout << endl;
    return 0;
}

I use this code to extract only the text from the HTML page, but it also extracts the JavaScript code. Is there anything wrong with my code?

OUTPUT:

 Googlewindow.google={kEI:"0a97TLvcFMS7rAe5htz9Ag",kEXPI:"25901,26119,26325",kCSI:{e:"25901,26119,26325",ei:"0a97TLvcFMS7rAe5htz9Ag",expi:"25901,26119,26325"},ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(b,d,c){var a=new Image,e=google,g=e.lc,f=e.li;a.onerror=(a.onload=(a.onabort=function(){delete g[f]}));g[f]=a;c=c||"/gen_204?atyp=i&ct="+b+"&cad="+d+"&zx="+google.time();a.src=c;e.li=f+1},lc:[],li:0,Toolbelt:{}};
window.google.sn="webhp";window.google.timers={load:{t:{start:(new Date).getTime()}}};try{}catch(u){}window.google.jsrt_kill=1;
var _gjwl=location;function _gjuc(){var e=_gjwl.href.indexOf("#");if(e>=0){var a=_gjwl.href.substring(e);if(a.indexOf("&q=")>0||a.indexOf("#q=")>=0){a=a.substring(1);if(a.indexOf("#")==-1){for(var c=0;c<a.length;){var d=c;if(a.charAt(d)=="&")++d;var b=a.indexOf("&",d);if(b==-1)b=a.length;var f=a.substring(d,b);if(f.indexOf("fp=")==0){a=a.substring(0,c)+a.substring(b,a.length);b=c}else if(f=="cad=h")return 0;c=b}_gjwl.href="/search?"+a+"&cad=h";return 1}}}return 0}function _gjp(){!(window._gjwl.hash&&
window._gjuc())&&setTimeout(_gjp,500)};
window._gjp && _gjp()body{margin:0}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}body,td,a,p,.h{font-family:arial,sans-serif}.h{color:#36c;font-size:20px}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{width:496px}.tiah{width:458px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}#gog{background:#fff}#gbar,#guser{font-size:13px;padding-top:1px !important}#gbar{float:left;height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}#gbs,.gbm{background:#fff;left:0;position:absolute;text-align:left;visibility:hidden;z-index:1000}.gbm{border:1px solid;border-color:#c9d7f1 #36c #36c #a2bae7;z-index:1001}.gb1{margin-right:.5em}.gb1,.gb3{zoom:1}.gb2{display:block;padding:.2em .5em}.gb2,.gb3{text-decoration:none;border-bottom:none}a.gb1,a.gb2,a.gb3,a.gb4{color:#00c !important}a.gb2:hover{background:#36c;color:#fff !important}body{background:#fff;color:black}input{-moz-box-sizing:content-box}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#4272db}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff!important}.ds{display:-moz-inline-box}.ds{border-bottom:solid 1px #e7e7e7;border-right:solid 1px #e7e7e7;display:inline-block;margin:3px 0 4px;margin-left:4px}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px;}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px;display:block}.lsb{background:url(/images/srpr/nav_logo14.png) bottom;font:15px arial,sans-serif;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}.ftl,#fll a{margin:0 12px}#addlang a{padding:0 3px}.gac_v 
div{display:none}.gac_v .gac_v2,.gac_bt{display:block!important}google.y={};google.x=function(e,g){google.y[e.id]=[e,g];return false};window.gbar={qs:function(){},tg:function(e){var o={id:'gbar'};for(i in e)o[i]=e[i];google.x(o,function(){gbar.tg(o)})}};Web Images Maps News Orkut Books Gmail more &#9660;Translate Scholar Blogs YouTube Calendar Photos Documents Reader Sites Groups even more &raquo; iGoogle | Search settings | Sign in India&nbsp;Advanced SearchLanguage ToolsGoogle.co.in offered in: Hindi Bengali Telugu Marathi Tamil Gujarati Kannada Malayalam PunjabiAdvertising&nbsp;ProgramsAbout GoogleGo to Google.com&copy; 2010 - Privacy if(google.y)google.y.first=[];if(google.y)google.y.first=[];google.dstr=[];google.rein=[];window.setTimeout(function(){var a=document.createElement("script");a.src="/extern_js/f/CgJlbhICaW4gACswRTgBLCswWjgDLCswDjgALCswFzgHLCswJzgELCswPDgDLCswUTgDLCswCjhzQB0sKzAWOB0sKzAZOCAsKzAlOMqIASwrMDU4BCwrMEA4EiwrMEE4BSwrME44BiwrMFQ4ASwrMBg4BSwrMCY4DSyAAheQAhg/x2R96GGjycQ.js";(document.getElementById("xjsd")||document.body).appendChild(a);if(google.timers&&google.timers.load.t)google.timers.load.t.xjsls=(new Date).getTime();},0);
;google.neegg=1;google.y.first.push(function(){var form=document.f||document.f||document.gs;google.ac.i(form,form.q,'','','',{o:1,sw:1});google.History&&google.History.initialize('/')});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);google.fade=null;}(function(){
var b,d,e,f;function g(a,c){if(a.removeEventListener){a.removeEventListener("load",c,false);a.removeEventListener("error",c,false)}else{a.detachEvent("onload",c);a.detachEvent("onerror",c)}}function h(a){f=(new Date).getTime();++d;a=a||window.event;var c=a.target||a.srcElement;g(c,h)}var i=document.getElementsByTagName("img");b=i.length;d=0;for(var j=0,k;j<b;++j){k=i[j];if(k.complete||typeof k.src!="string"||!k.src)++d;else if(k.addEventListener){k.addEventListener("load",h,false);k.addEventListener("error",
h,false)}else{k.attachEvent("onload",h);k.attachEvent("onerror",h)}}e=b-d;function l(){if(!google.timers.load.t)return;google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=f;google.kCSI.imc=d;google.kCSI.imn=b;google.kCSI.imp=e;google.timers.load.t.xjs&&google.report&&google.report(google.timers.load,google.kCSI)}if(window.addEventListener)window.addEventListener("load",l,false);else if(window.attachEvent)window.attachEvent("onload",l);google.timers.load.t.prt=(f=(new Date).getTime());
})();


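This is expected behavior (I am one of the library authors). You need to skip the <script> tags (and the <style> tags, for the CSS) in your code if you do not want the JavaScript payload to be printed. The payload is stored as text nodes beneath those tags, so you can simply add the following as the first line in your for loop to skip each such tag together with its children.

if (it->isTag() && (strcasecmp(it->tagName().c_str(), "script") == 0 || strcasecmp(it->tagName().c_str(), "style") == 0)) { it.skip_children(); continue; }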


It may be best to ask the htmlcxx people. Over at http://htmlcxx.sourceforge.net/ you should be able to find mailing lists or contact details.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜