CURLOPT_WRITEFUNCTION pointer to member function
I'm trying to move this curl callback function into my class but am having trouble with CURLOPT_WRITEFUNCTION. Following the compiler errors didn't lead me to a solution. I also tried a few things based on Stack Overflow answers, to no avail.
Here are my attempts (each one replacing 'writer' in this code):
node::writer, &node::writer, and std::bind1st(std::mem_fun(&node::writer), this);
Here's my code:
#ifndef NODE_H_
#define NODE_H_
/*
* function prototypes
*
* Forward declaration of the write callback that node::curlHttpget hands to
* libcurl via CURLOPT_WRITEFUNCTION. The definition is at the end of this
* file; it appends size * nmemb bytes of data to *buffer and returns the
* number of bytes it consumed.
*/
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);
/*
* One fetched web page: keeps the page url and its raw source, plus the
* heading and anchor tags that were scraped out of that source.
*/
class node {
/*
* general struct to hold html element properties
* (only declared here; the definition follows the class as node::tag)
*/
struct tag;
/*
* the url and source of the page
* (both are assigned by load())
*/
std::string url;
std::string source;
/*
* vectors of structures that store tag elements
* (filled through add_heading() and add_anchor())
* NOTE(review): tag is only forward-declared at this point; std::vector
* members of an incomplete type are guaranteed by the standard only from
* C++17 on - confirm the toolchain accepts this.
*/
std::vector<tag> heading;
std::vector<tag> anchor;
/*
* grab source with curl
* @param url address to fetch
* @return the response body, or an empty string when the request failed
*/
std::string curlHttpget(const std::string &url);
/*
* add tag structs to vector
* first argument is the tag text, second argument its properties
* @see std::vector<tag> heading
* @see std::vector<tag> anchor
*/
void add_heading(std::string, std::string);
void add_anchor(std::string, std::string);
public:
/*
* constructors
* the default constructor creates an empty node; the std::string overload
* loads the given url and scrapes its anchors and headings right away
*/
node(){}
node(std::string);
/*
* destructor
*/
~node(){}
/*
* crawl page
*/
void load(std::string seed);// stores seed as the url and downloads its source
/*
* anchor tags
*/
void get_anchors();// scrape the anchor tags out of the stored source
void display_anchors();// print every stored anchor and the total count
/*
* heading tags
*/
void get_headings();// scrape heading tags out of the stored source
void display_headings();// print every stored heading and the total count
};
/*
* Record for one scraped html element.
* text       - what was found between the opening and the closing tag
* properties - the properties captured for the element (the attribute text
*              for headings, the resolved link for anchors)
*/
struct node::tag {
    std::string text;
    std::string properties;

    // store copies of both values
    tag(std::string tagText, std::string tagProperties)
        : text(tagText),
          properties(tagProperties)
    {
    }
};
/*
* constructors
*
* Build a node straight from a seed url: download the page first, then
* scrape its anchor tags and finally its heading tags.
*/
node::node(std::string seed)
{
    this->load(seed);
    this->get_anchors();
    this->get_headings();
}
/*
* araneus::subroutines
*/
// crawl the page
void node::load(std::string seed) {
url = seed;
source = curlHttpget(url);
}
//scrape html source
std::string node::curlHttpget(const std::string &url) {
std::string buffer;
CURL *curl;
CURLcode result;
curl = curl_easy_init();
if (curl) {
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_HEADER, 0);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
result = curl_easy_perform(curl);//http get performed
curl_easy_cleanup(curl);//must cleanup
//error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
if (result == CURLE_OK) {
return buffer;
}
//curl_easy_strerror was added in libcurl 7.12.0
std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
return "";
}
std::cerr << "error: could not initalize curl" << std::endl;
return "";
}
void node::get_headings() {
static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");
int const subMatches[] = { 1, 2 };
sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
sregex_token_iterator end;
string text;
string properties;
int count = 0;
for (;p != end; count++, ++p)
{
string m(p->first, p->second);
if(count % 2) {
text = m;
add_heading(text, properties);
}
else {
properties = m;
}
}
}
//use regex to find anchors in source
/*
* Scrape the <a ... href="..."> elements out of `source`.
*
* The token iterator yields, for every match, capture 1 (the href value)
* followed by capture 2 (the link text). The href is turned into the link
* stored in `properties`:
*   "/somewhere"        -> url + href
*   "scheme://..."      -> href unchanged (http, https, ftp, sftp)
*   "somewhere.html"    -> url + "/" + href
*   anything else       -> reported on std::cout and stored unchanged
* When the link text arrives, text and link are stored via add_anchor().
*/
void node::get_anchors() {
    // [aA] instead of [a|A] (which also accepted '|'); the whitespace after
    // the tag name keeps <abbr>, <area>, ... out, [^>]*? keeps the search for
    // href inside the opening tag, and (?:href|HREF) matches the whole word
    // rather than any single one of its letters
    static const regex expression("<[aA]\\s[^>]*?(?:href|HREF)\\s*=[\"'](?<url>.*?)[\"'].*?>(?<name>.*?)</[aA]>");
    static const regex relative("^\\/");
    static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
    // an alternation of whole schemes; the former [...] form was a character
    // class, so any link starting with h, t, p, s or f counted as absolute
    static const regex protocol("^(?:http|HTTP|https|HTTPS|ftp|FTP|sftp|SFTP):\\/\\/");
    int const subMatches[] = { 1, 2 };
    sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
    sregex_token_iterator end;
    string properties;
    int count = 0;
    for (; p != end; ++count, ++p) {
        std::string m(p->first, p->second);
        if (count % 2) { // odd token: the link text that belongs to the href seen just before
            add_anchor(m, properties);
            continue;
        }
        if (regex_search(m, relative)) { //if link is in "/somewhere" format
            properties = url + m;
        }
        else if (regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
            properties = m;
        }
        else if (regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
            properties = url + "/" + m;
        }
        else {
            std::cout << "link of unknown protocol: " << m << std::endl;
            // keep the raw href; leaving `properties` untouched here would
            // store this anchor with the link of the previous one
            properties = m;
        }
    }
}
void node::add_heading(std::string text, std::string properties) {
heading.push_back(tag(text, properties));
}
void node::display_headings() {
for(int i = 0; i < (int)heading.size(); i++) {
std::cout<< "[h]: " << heading[i].text << endl;
std::cout<< "[h.properties]: " << heading[i].properties << endl;
}
cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}
void node::add_anchor(std::string text, std::string properties) {
anchor.push_back(tag(text, properties));
}
void node::display_anchors() {
for(int i = 0; i < (int)anchor.size(); i++) {
std::cout<< "[a]: " << anchor[i].text << endl;
std::cout<< "[a.properties]: " << anchor[i].properties << endl;
}
cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}
//required by libcurl
/*
* Write callback: append one chunk of received data to *buffer.
*
* @param data   start of the received chunk (not null-terminated)
* @param size   size of one element
* @param nmemb  number of elements; the chunk is size * nmemb bytes long
* @param buffer string the chunk is appended to
* @return the number of bytes appended (size * nmemb), or 0 when buffer is
*         null and nothing could be stored
*/
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
    if (!buffer) {
        return 0;
    }
    const std::size_t bytes = size * nmemb;
    buffer->append(data, bytes);
    // the declared return type is int, so make the narrowing explicit
    return static_cast<int>(bytes);
}
#endif /* NODE_H_ */
I'm looking for a way to turn the free function 'int writer' into the member function "int node::writer". The problem occurs in std::string node::curlHttpget, where I pass the callback to curl_easy_setopt with CURLOPT_WRITEFUNCTION.
&node::writer compiles but gives a seg fault =/
thanks
Instead of using std::string* use node* as the parameter or another class like HttpGet that has a std::string and a pointer back to your node so it can write to the string and access your node on each call.
boost::bind won't work for C-API callbacks.
It compiles because curl_easy_setopt uses ... so is totally not typesafe. You can pass it any type you want under the sun and it will compile. It probably won't run though, as you found to your cost.
I would go for the extra type-safety of giving your function exactly the same signature as libcurl's curl_write_callback, i.e. size_t as the return type and void* as the 4th parameter, and do the cast to your own type inside the function implementation.
精彩评论