开发者

CURLOPT_WRITEFUNCTION pointer to member function

I'm trying to move this curl function into my class, but I'm having trouble with CURLOPT_WRITEFUNCTION. Following the compiler errors didn't lead me to a solution. I also tried a few things based on Stack Overflow answers, to no avail.

Here's my attempt (replacing 'writer' in this code)

node::writer &node::writer std::bind1st(std::mem_fun(&node::writer), this);

Here's my code:

#ifndef NODE_H_
#define NODE_H_

/*
 * write callback handed to libcurl via CURLOPT_WRITEFUNCTION in
 * node::curlHttpget; appends size * nmemb bytes from data to *buffer and
 * returns that byte count, or 0 when buffer is NULL
 */
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);

/*
 * function prototypes
 */

/*
 * node: one crawled page -- its url, its raw source, and the heading and
 * anchor tags scraped from that source
 */
class node {
 /*
  * general struct to hold html element properties
  * (only declared here; defined after the class)
  */
 struct tag;

 /*
  * the url and source of the page
  */
 std::string url;
 std::string source;

 /*
  *  vector of structures that store tag elements
  */
 std::vector<tag> heading;
 std::vector<tag> anchor;

 /*
  * grab source with curl
  * returns the downloaded source, or "" when curl reports an error
  */
 std::string curlHttpget(const std::string &url);

 /*
  * add tag structs to vector (arguments are: text, properties)
  * @see std::vector<tag> heading
  * @see std::vector<tag> anchor
  */
 void add_heading(std::string, std::string);
 void add_anchor(std::string, std::string);

public:
 /*
  * constructors
  * the std::string overload loads the given page and then scrapes its
  * anchors and headings
  */
 node(){}
 node(std::string);

 /*
  * destructor
  */
 ~node(){}

 /*
  * crawl page: stores seed as the url and downloads its source
  */
 void load(std::string seed);// crawls the page

 /*
  * anchor tags
  */
 void get_anchors();// scrape the anchor tags out of source
 void display_anchors();// print stored anchors and their count to stdout

 /*
  * heading tags
  */
 void get_headings();// scrape the heading tags out of source
 void display_headings();// print stored headings and their count to stdout
};
/*
 * one stored html element: its text plus its properties string
 * (used for both the heading and the anchor vectors)
 */
struct node::tag {
 std::string text;
 std::string properties;

 tag(std::string elementText, std::string elementProperties)
  : text(elementText),
    properties(elementProperties)
 {
 }
};

/*
 * constructor: download the seed page, then scrape its anchor tags and
 * its heading tags (in that order)
 */
node::node(std::string seed) {
 this->load(seed);
 this->get_anchors();
 this->get_headings();
}
/*
 * araneus::subroutines
 */

// crawl the page
void node::load(std::string seed) {
 url = seed;
 source = curlHttpget(url);
}


//scrape html source
std::string node::curlHttpget(const std::string &url) {
 std::string buffer;

 CURL *curl;
 CURLcode result;

 curl = curl_easy_init();

 if (curl) {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HEADER, 0);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  result = curl_easy_perform(curl);//http get performed

  curl_easy_cleanup(curl);//must cleanup

  //error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
  if (result == CURLE_OK) {
   return buffer;
  }
  //curl_easy_strerror was added in libcurl 7.12.0
  std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
  return "";
 }

 std::cerr << "error: could not initalize curl" << std::endl;
 return "";
}

void node::get_headings() {
 static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text;
 string properties;

 int count = 0;
 for (;p != end; count++, ++p)
 {
  string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_heading(text, properties);
  }
  else {
   properties = m;
  }
 }
}

/*
 * use regex to find anchors in source
 *
 * every <a ... href="...">name</a> match is stored through add_anchor with
 * the link text as text and the resolved link as properties:
 *   "/somewhere"        -> url + link
 *   "scheme://..."      -> link unchanged (http, https, ftp, sftp)
 *   "somewhere.html"    -> url + "/" + link
 * anything else is reported on std::cout and stored with empty properties
 */
void node::get_anchors() {
 // alternatives must be groups: a class such as [href|HREF] matches a single
 // character only, which let e.g. title="..." be picked up as the link
 static const regex expression("<[aA]\\b.*?(?:href|HREF)\\s*=[\"\"'](?<url>.*?)[\"\"'].*?>(?<name>.*?)</[aA]>");
 static const regex relative("^\\/");
 static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
 static const regex protocol("^(?:http|HTTP|https|HTTPS|ftp|FTP|sftp|SFTP):\\/\\/");

 // for every match the iterator yields sub-match 1 (url) and then
 // sub-match 2 (name), so tokens arrive in alternating order
 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string properties;

 int count = 0;
 for (; p != end; count++, ++p) {
  std::string m(p->first, p->second);

  if(count % 2) {
   // odd token: the link text; the anchor is complete, store it
   add_anchor(m, properties);
  }
  else {
   if(regex_search(m, relative)) { //if link is in "/somewhere" format
    properties = url + m;
   }
   else if(regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
    properties = m;
   }
   else if(regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
    properties = url + "/" + m;
   }
   else {
    std::cout << "link of unknown protocol: " << m << std::endl;
    // do not let this anchor inherit the previous anchor's link
    properties.clear();
   }
  }
 }
}

void node::add_heading(std::string text, std::string properties) {
 heading.push_back(tag(text, properties));
}

void node::display_headings() {
 for(int i = 0; i < (int)heading.size(); i++) {
  std::cout<< "[h]: " << heading[i].text << endl;
  std::cout<< "[h.properties]: " << heading[i].properties << endl;
 }
 cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}

void node::add_anchor(std::string text, std::string properties) {
 anchor.push_back(tag(text, properties));
}

void node::display_anchors() {
 for(int i = 0; i < (int)anchor.size(); i++) {
  std::cout<< "[a]: " << anchor[i].text << endl;
  std::cout<< "[a.properties]: " << anchor[i].properties << endl;
 }
 cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}

/*
 * required by libcurl: receives one chunk of downloaded data
 * @param data   start of the chunk (not null-terminated)
 * @param size   size of one element
 * @param nmemb  number of elements; the chunk is size * nmemb bytes long
 * @param buffer string the chunk is appended to
 * @return number of bytes appended, or 0 when buffer is NULL
 */
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
 if (buffer == NULL) {
  return 0;
 }

 const std::size_t bytes = size * nmemb;
 buffer->append(data, bytes);
 return static_cast<int>(bytes);
}

#endif /* NODE_H_ */

I'm looking for a way to turn the function 'int writer' into 'int node::writer'. The problem occurs in std::string node::curlHttpget, where I set CURLOPT_WRITEFUNCTION.

&node::writer compiles but gives a seg fault =/

thanks


Instead of using std::string*, use node* as the parameter, or another class such as HttpGet that holds a std::string and a pointer back to your node, so that the callback can write to the string and access your node on each call.

boost::bind won't work for C-API callbacks.

It compiles because curl_easy_setopt uses ... so is totally not typesafe. You can pass it any type you want under the sun and it will compile. It probably won't run though, as you found to your cost.

I would go for the extra type-safety of making your function have exactly the same signature as curl_write_callback, i.e. void* as the 4th parameter, and do the casting in the function implementation.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜