curl not grabbing the page on the second pass, instead returning an empty string?
I have the following code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
/* Downloads `link` with libcurl and returns the string produced by running the
 * page through strip_tables, strip_itals and strip_parens. A non-zero
 * `rand_flag` enables CURLOPT_FOLLOWLOCATION for the request. */
char * return_next(char *link, int rand_flag);
/* Returns a heap string: "http://en.wikipedia.org" followed by the target of
 * the first anchor found outside parentheses, or "" when there is none. */
char* strip_parens(char* string);
/* Returns a heap copy of `string` with the contents of <i>...</i> dropped. */
char* strip_itals(char* string);
/* Returns a heap copy of `string` with the contents of <table>...</table> dropped. */
char* strip_tables(char* string);
/* Growable byte buffer that accumulates a response body. */
struct MemoryStruct {
    char *memory;   /* heap buffer; NUL-terminated after every successful write */
    size_t size;    /* payload bytes stored, not counting the trailing NUL */
};

/*
 * libcurl write callback: appends `size * nmemb` bytes from `ptr` to the
 * struct MemoryStruct passed through `data`, keeping it NUL-terminated.
 *
 * Returns the number of bytes consumed. On allocation failure or size
 * overflow it returns 0 and leaves the existing buffer untouched; a short
 * count makes libcurl abort the transfer with a write error, so the caller
 * should check the result of curl_easy_perform().
 */
static size_t
WriteMemoryCallback(void *ptr, size_t size, size_t nmemb, void *data)
{
    struct MemoryStruct *mem = (struct MemoryStruct *)data;
    size_t realsize;
    char *grown;

    /* Reject size * nmemb wrapping around before it is used as a length. */
    if (nmemb != 0 && size > (size_t)-1 / nmemb) {
        return 0;
    }
    realsize = size * nmemb;

    /* Also reject mem->size + realsize + 1 wrapping around. */
    if (realsize >= (size_t)-1 - mem->size) {
        return 0;
    }

    /* Keep the old pointer until realloc succeeds so nothing is lost on failure. */
    grown = realloc(mem->memory, mem->size + realsize + 1);
    if (grown == NULL) {
        fprintf(stderr, "not enough memory (realloc returned NULL)\n");
        return 0;
    }
    mem->memory = grown;

    memcpy(mem->memory + mem->size, ptr, realsize);
    mem->size += realsize;
    mem->memory[mem->size] = '\0';
    return realsize;
}
/*
 * Starting from the "Literature" article, repeatedly follows the link
 * returned by return_next() until the "Philosophy" article is reached, then
 * reports how many clicks that took.
 *
 * Returns 0 on success, EXIT_FAILURE when memory runs out, no next link can
 * be found, or the click limit is exceeded.
 */
int main(void)
{
    /* Upper bound on hops so a cycle of articles cannot loop forever. */
    enum { MAX_CLICKS = 1000 };
    static const char start[] = "http://en.wikipedia.org/wiki/Literature";
    static const char target[] = "http://en.wikipedia.org/wiki/Philosophy";
    int i = 0, rand_flag = 0;
    char *page;

    /* `page` is always heap-owned so every iteration can free the old one. */
    page = malloc(sizeof start);
    if (page == NULL) {
        fprintf(stderr, "not enough memory\n");
        return EXIT_FAILURE;
    }
    memcpy(page, start, sizeof start);
    printf("%s\n\n", page);

    while (strcmp(page, target) != 0) {
        char *next;

        if (i >= MAX_CLICKS) {
            fprintf(stderr, "giving up after %d clicks\n", i);
            free(page);
            return EXIT_FAILURE;
        }

        next = return_next(page, rand_flag);
        free(page);             /* the previous URL is no longer needed */
        page = next;

        /* NULL or "" means no next link could be extracted. */
        if (page == NULL || page[0] == '\0') {
            fprintf(stderr, "no next link found after %d clicks\n", i);
            free(page);
            return EXIT_FAILURE;
        }

        i++;
        printf("deep: %d, %s\n\n", i, page);
        rand_flag = 0;
    }

    printf("start link: %s, is %d clicks from philosophy", start, i);
    free(page);
    return 0;
}
char * return_next(char *link, int rand_flag){
CURL *curl_handle;
struct MemoryStruct chunk;
chunk.memory = malloc(1);
chunk.size = 0;
curl_global_init(CURL_GLOBAL_ALL);
curl_handle = curl_easy_init();
curl_easy_setopt(curl_handle, CURLOPT_URL, link);
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
if(rand_flag){
curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1);
}
curl_easy_perform(curl_handle);
curl_easy_cleanup(curl_handle);
char *theString = malloc(strlen(chunk.memory)+1);
char *theString1 = malloc(strlen(theString) + 1);
theString = strstr(chunk.memory, "div id=\"body");
theString1 = strip_tables(theString);
if(chunk.memory)
free(chunk.memory);
theString = strstr(theString1, "<p>");
theString1 = strip_itals(theString);
theString = strip_parens(theString1);
curl_global_cleanup();
return theString;
}
/*
 * Finds the first `<a href="...">` in `string` that is not inside
 * parentheses and returns its target as an absolute URL.
 *
 * string - NUL-terminated HTML fragment; it is not modified.
 *
 * Returns a heap string owned by the caller: "http://en.wikipedia.org"
 * followed by the href value, or "" when no usable link is found (including
 * an href whose closing quote is missing). Returns NULL when `string` is NULL
 * or memory runs out.
 */
char *strip_parens(char *string)
{
    static const char anchor[] = "<a href=\"";
    static const char prefix[] = "http://en.wikipedia.org";
    const size_t anchor_len = sizeof anchor - 1;
    const size_t prefix_len = sizeof prefix - 1;
    const char *p;
    char *empty;
    int depth = 0;

    if (string == NULL) {
        return NULL;
    }

    for (p = string; *p != '\0'; p++) {
        if (*p == '(') {
            depth++;
        } else if (*p == ')' && depth > 0) {
            depth--;
        } else if (depth == 0 && strncmp(p, anchor, anchor_len) == 0) {
            const char *href = p + anchor_len;
            const char *end = strchr(href, '"');
            size_t href_len;
            char *url;

            if (end == NULL) {
                break;          /* unterminated attribute: treat as no link */
            }
            href_len = (size_t)(end - href);

            /* +1 for the terminating NUL. */
            url = malloc(prefix_len + href_len + 1);
            if (url == NULL) {
                return NULL;
            }
            memcpy(url, prefix, prefix_len);
            memcpy(url + prefix_len, href, href_len);
            url[prefix_len + href_len] = '\0';
            return url;
        }
    }

    /* No link: hand back an empty heap string so the caller can free() it. */
    empty = malloc(1);
    if (empty != NULL) {
        empty[0] = '\0';
    }
    return empty;
}
/*
 * Copies `string`, leaving out every `<i>...</i>` section (tags included).
 * Nested `<i>` tags are tracked, and a `</i>` with no matching `<i>` is
 * copied through unchanged.
 *
 * string - NUL-terminated HTML fragment; it is not modified.
 *
 * Returns a heap string owned by the caller, or NULL when `string` is NULL or
 * memory runs out.
 */
char *strip_itals(char *string)
{
    static const char open_tag[] = "<i>";
    static const char close_tag[] = "</i>";
    const size_t open_len = sizeof open_tag - 1;
    const size_t close_len = sizeof close_tag - 1;
    const char *p;
    char *result;
    size_t out = 0;
    int depth = 0;

    if (string == NULL) {
        return NULL;
    }

    /* The output is never longer than the input. */
    result = malloc(strlen(string) + 1);
    if (result == NULL) {
        return NULL;
    }

    p = string;
    while (*p != '\0') {
        if (strncmp(p, open_tag, open_len) == 0) {
            depth++;
            p += open_len;
        } else if (depth > 0 && strncmp(p, close_tag, close_len) == 0) {
            /* Skip the whole closing tag so no stray '<' is emitted. */
            depth--;
            p += close_len;
        } else {
            if (depth == 0) {
                result[out++] = *p;
            }
            p++;
        }
    }
    result[out] = '\0';
    return result;
}
/*
 * Copies `string`, leaving out every `<table ...>...</table>` section (tags
 * included). Nested tables are tracked, so text resumes only after the
 * outermost table closes. A `</table>` with no open table is copied through
 * unchanged.
 *
 * string - NUL-terminated HTML fragment; it is not modified.
 *
 * Returns a heap string owned by the caller, or NULL when `string` is NULL or
 * memory runs out.
 */
char *strip_tables(char *string)
{
    static const char open_tag[] = "<table";
    static const char close_tag[] = "</table";
    const size_t open_len = sizeof open_tag - 1;
    const size_t close_len = sizeof close_tag - 1;
    const char *p;
    char *result;
    size_t out = 0;
    int depth = 0;

    if (string == NULL) {
        return NULL;
    }

    /* The output is never longer than the input. */
    result = malloc(strlen(string) + 1);
    if (result == NULL) {
        return NULL;
    }

    p = string;
    while (*p != '\0') {
        if (strncmp(p, open_tag, open_len) == 0) {
            /* Attributes after "<table" are dropped because depth > 0. */
            depth++;
            p += open_len;
        } else if (depth > 0 && strncmp(p, close_tag, close_len) == 0) {
            const char *gt = strchr(p, '>');

            depth--;
            if (gt == NULL) {
                break;          /* truncated closing tag: nothing left to copy */
            }
            /* Resume after the '>' so no part of the tag is emitted. */
            p = gt + 1;
        } else {
            if (depth == 0) {
                result[out++] = *p;
            }
            p++;
        }
    }
    result[out] = '\0';
    return result;
}
Given a link to a wiki article, this returns the first link on that page; in main I loop over this function until I arrive at a specified article. I ran it from some random article and discovered that when it passes over "Literature" it gets "Art" as the next page, but when it goes to fetch Art, curl returns a blank string - if I printf("%s", chunk.memory) after the call I get (null). If I manually force the function to start at Art it works fine, trailing all the way to Philosophy. For the life of me I can't see any differences... I put some diagnostic printfs in and got the following:
this is the address ~> !http://en.wikipedia.org/wiki/Art!, rand flag = 0
The link is the text in between the exclamation marks, so I know it's parsing the link back properly, and rand_flag is always set to 0 at the moment.
Any tips, pointers or solutions much appreciated.
It is not generally possible to say anything about a program if all you have is an uncompilable piece of code. So I'm going to give some generic recommendations.
- Check return values of your functions.
- Set up callbacks to libcurl so that you can print every byte that goes in and out with a flip of a switch (much like `curl -v` does — look at its source if you need guidance).
- Sniff your network traffic.
- If you see that a request is not sent at all, or that it's sent but no data is returned, you have narrowed your problem a bit.
The code is completely borked and will not work. Let me illustrate by snipping a piece of the code and commenting on it:
char *theString = malloc(strlen(chunk.memory)+1);
char *theString1 = malloc(strlen(theString) + 1);
Doing strlen(theString) will call strlen() on a pointer that points to uninitialized memory. Can be anything. And then you allocate that size and put in a second pointer...
theString = strstr(chunk.memory, "div id=\"body");
... and yet you assign 'theString' again to a position within the 'chunk.memory' memory area.
theString1 = strip_tables(theString);
And you assign 'theString1' again to some position within 'theString'.
You've now leaked your two mallocs.
if(chunk.memory) free(chunk.memory);
And look, you now freed the data your two pointers are pointing to. They now point to garbage.
theString = strstr(theString1, "<p>");
... and now you search in the data you already freed.
Do I need to say more?
精彩评论