开发者

curl not grabbing the page on the second pass, returning an empty string instead?

I have the following code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>

char * return_next(char *link, int rand_flag);
char* strip_parens(char* string);
char* strip_itals(char* string);
char* strip_tables(char* string);

/* Growable byte buffer that WriteMemoryCallback appends downloaded data to. */
struct MemoryStruct {
    char *memory;   /* heap buffer: `size` payload bytes followed by a '\0' */
    size_t size;    /* number of payload bytes currently stored */
};


/*
 * Write callback in the shape libcurl expects for CURLOPT_WRITEFUNCTION.
 *
 * Appends the `size * nmemb` bytes at `ptr` to the MemoryStruct passed in
 * `data`, growing the buffer with realloc and keeping it NUL-terminated.
 *
 * Returns the number of bytes consumed (always all of them). If the buffer
 * cannot be grown, a message is printed and the process exits.
 */
static size_t
WriteMemoryCallback(void *ptr, size_t size, size_t nmemb, void *data)
{
    struct MemoryStruct *buf = data;
    const size_t incoming = size * nmemb;
    const size_t old_len = buf->size;

    /* One extra byte so the accumulated data can be used as a C string. */
    buf->memory = realloc(buf->memory, old_len + incoming + 1);
    if (!buf->memory) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        exit(EXIT_FAILURE);
    }

    char *tail = buf->memory + old_len;
    memcpy(tail, ptr, incoming);
    tail[incoming] = 0;
    buf->size = old_len + incoming;

    return incoming;
}


/*
 * Follow "first link" hops starting from the Literature article until the
 * Philosophy article is reached, printing every page visited and the final
 * number of clicks.
 *
 * Each string returned by return_next() is heap-allocated and owned by this
 * function, so the previous page is freed before moving on. A NULL or empty
 * result means no next link could be obtained; the walk then stops with an
 * error instead of dereferencing NULL or requesting an empty URL forever.
 */
int main(void)
{
    static const char first_page[] = "http://en.wikipedia.org/wiki/Literature";
    static const char target_page[] = "http://en.wikipedia.org/wiki/Philosophy";

    char *page = malloc(sizeof first_page);
    char *start = malloc(sizeof first_page);
    if (page == NULL || start == NULL) {
        fprintf(stderr, "not enough memory\n");
        free(page);
        free(start);
        return EXIT_FAILURE;
    }
    memcpy(page, first_page, sizeof first_page);
    memcpy(start, first_page, sizeof first_page);

    printf("%s\n\n", page);

    int i = 0, rand_flag = 0;
    while (strcmp(page, target_page) != 0) {
        char *next = return_next(page, rand_flag);

        /* The old URL is no longer needed once the next one is known. */
        free(page);
        page = next;

        if (page == NULL || page[0] == '\0') {
            fprintf(stderr, "could not find a next link after %d clicks\n", i);
            free(page);
            free(start);
            return EXIT_FAILURE;
        }

        i++;
        printf("deep: %d, %s\n\n", i, page);
        rand_flag = 0;
    }
    printf("start link: %s, is %d clicks from philosophy", start, i);

    free(page);
    free(start);
    return 0;
}


char * return_next(char *link, int rand_flag){
CURL *curl_handle;
struct MemoryStruct chunk;
chunk.memory = malloc(1); 
chunk.size = 0;    

curl_global_init(CURL_GLOBAL_ALL);
curl_handle = curl_easy_init();
curl_easy_setopt(curl_handle, CURLOPT_URL, link);
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
if(rand_flag){
    curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1);   
}
curl_easy_perform(curl_handle);
curl_easy_cleanup(curl_handle);

char *theString = malloc(strlen(chunk.memory)+1);

char *theString1 = malloc(strlen(theString) + 1);

theString = strstr(chunk.memory, "div id=\"body");

theString1 = strip_tables(theString);

if(chunk.memory)
    free(chunk.memory);

theString = strstr(theString1, "<p>");

theString1 = strip_itals(theString);

theString = strip_parens(theString1);

curl_global_cleanup();

return theString;
}

/*
 * Find the first `<a href="...">` in `string` that is not inside
 * parentheses and return its href value prefixed with
 * "http://en.wikipedia.org".
 *
 * Parentheses are tracked with a nesting counter; an unmatched ')' is
 * ignored. An href whose closing quote is missing is treated as "no link".
 *
 * Returns a heap-allocated string that the caller must free():
 *   - the absolute URL when a link is found,
 *   - an empty string when no link is found.
 * Returns NULL if `string` is NULL or memory cannot be allocated.
 */
char* strip_parens(char* string) {
    static const char link_open[] = "<a href=\"";
    static const char site[] = "http://en.wikipedia.org";
    const size_t open_len = sizeof link_open - 1;
    const size_t site_len = sizeof site - 1;

    if (string == NULL) {
        return NULL;
    }

    size_t depth = 0;
    for (const char *p = string; *p != '\0'; p++) {
        if (*p == '(') {
            depth++;
        }
        else if (*p == ')') {
            if (depth > 0) {
                depth--;
            }
        }
        else if (depth == 0 && strncmp(p, link_open, open_len) == 0) {
            const char *href = p + open_len;
            const char *end = strchr(href, '"');
            if (end == NULL) {
                /* Unterminated attribute: nothing usable follows. */
                break;
            }

            size_t href_len = (size_t)(end - href);
            /* +1 for the terminating NUL. */
            char *url = malloc(site_len + href_len + 1);
            if (url == NULL) {
                return NULL;
            }
            memcpy(url, site, site_len);
            memcpy(url + site_len, href, href_len);
            url[site_len + href_len] = '\0';
            return url;
        }
    }

    /* No link outside parentheses: hand back an empty string. */
    char *empty = malloc(1);
    if (empty != NULL) {
        empty[0] = '\0';
    }
    return empty;
}

/*
 * Return a copy of `string` with every `<i>...</i>` element removed,
 * including the tags themselves. Nested `<i>` elements are tracked with a
 * depth counter, and only an exact `</i>` closes an element (so `</ins>`
 * and similar tags do not). A `</i>` seen outside any `<i>` is copied
 * through unchanged. If an `<i>` is never closed, everything after it is
 * dropped.
 *
 * Returns a heap-allocated string that the caller must free(), or NULL if
 * `string` is NULL or memory cannot be allocated.
 */
char* strip_itals(char* string) {
    if (string == NULL) {
        return NULL;
    }

    size_t len = strlen(string);
    char *result = malloc(len + 1);
    if (result == NULL) {
        return NULL;
    }

    size_t depth = 0;
    size_t j = 0;
    size_t i = 0;
    while (i < len) {
        /* strncmp stops at the terminating NUL, so these never overread. */
        if (strncmp(string + i, "<i>", 3) == 0) {
            depth++;
            i += 3;
            continue;
        }
        if (depth > 0 && strncmp(string + i, "</i>", 4) == 0) {
            depth--;
            /* Skip the whole closing tag so no part of it is emitted. */
            i += 4;
            continue;
        }
        if (depth == 0) {
            result[j++] = string[i];
        }
        i++;
    }
    result[j] = '\0';
    return result;
}

/*
 * Return a copy of `string` with every `<table ...>...</table>` element
 * removed, including the tags themselves. Nested tables are tracked with a
 * depth counter so an inner `</table>` does not end the outer table. A
 * `</table>` seen outside any table is copied through unchanged. If a table
 * is never closed, everything after its opening tag is dropped.
 *
 * Returns a heap-allocated string that the caller must free(), or NULL if
 * `string` is NULL or memory cannot be allocated.
 */
char* strip_tables(char* string) {
    if (string == NULL) {
        return NULL;
    }

    size_t len = strlen(string);
    char *result = malloc(len + 1);
    if (result == NULL) {
        return NULL;
    }

    size_t depth = 0;
    size_t j = 0;
    size_t i = 0;
    while (i < len) {
        /* strncmp stops at the terminating NUL, so these never overread. */
        if (strncmp(string + i, "<table", 6) == 0) {
            /* The rest of the opening tag is skipped as table content. */
            depth++;
            i += 6;
            continue;
        }
        if (depth > 0 && strncmp(string + i, "</table", 7) == 0) {
            depth--;
            i += 7;
            /* Consume up to and including the closing '>' of the tag. */
            while (i < len && string[i] != '>') {
                i++;
            }
            if (i < len) {
                i++;
            }
            continue;
        }
        if (depth == 0) {
            result[j++] = string[i];
        }
        i++;
    }
    result[j] = '\0';
    return result;
}

Given a link to a wiki article, this returns the first link on that page; in main I loop over this function until I arrive at a specified article. I started from some random article and discovered that when it passes over "Literature" it gets "Art" as the next page, but when it goes to fetch Art, curl returns a blank string - if I printf("%s", chunk.memory) after the call I get (null). If I manually force the function to start at Art it works fine, trailing all the way to Philosophy. For the life of me I can't see any difference... I put some diagnostic printfs in and got the following:

this is the address ~> !http://en.wikipedia.org/wiki/Art!, rand flag = 0

The link is printed between the exclamation marks, so I know it's parsing the link back properly, and rand_flag is always set to 0 at the moment.

Any tips, pointers or solutions much appreciated.


It is not generally possible to say anything about a program if all you have is an uncompilable piece of code. So I'm going to give some generic recommendations.

  1. Check return values of your functions.
  2. Set up callbacks to libcurl so that you can print every byte that goes in and out with a flip of a switch (much like curl -v does — look at its source if you need guidance).
  3. Sniff your network traffic.
  4. If you see that a request is not sent at all, or that it's sent but no data is returned, you have narrowed your problem a bit.


The code is completely borked and will not work. Let me illustrate by snipping a piece of the code and commenting:

char *theString = malloc(strlen(chunk.memory)+1);

char *theString1 = malloc(strlen(theString) + 1);

Doing strlen(theString) will call strlen() on a pointer that points to uninitialized memory. Can be anything. And then you allocate that size and put in a second pointer...

theString = strstr(chunk.memory, "div id=\"body");

... and yet you assign 'theString' again to a position within the 'chunk.memory' memory area.

theString1 = strip_tables(theString);

And you assign 'theString1' again to some position within 'theString'.

You've now leaked your two mallocs.

if(chunk.memory) free(chunk.memory);

And look, you now freed the data your two pointers are pointing to. They now point to garbage.

theString = strstr(theString1, "<p>");

... and now you search in the data you already freed.

Do I need to say more?

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜