开发者

Problem in B-tree (not B+/B*) implemented with files in C

i'm new here and first of all, i wanna apologize if i make errors in question. My problem is: i want to implement a B-tree in C, using a file to store the tree...my program reads 10000 strings of 10 characters each from a Text file, and store that content in a .DAT binary file, organized via B-tree; then the user can search for a string.

I'm using algorithms of "Cormen,et al - Introduction to Algorithms (3ed)", which seems to be correct, clear and functional. BUT, my program just get runtime errors...like Segmentation Fault and Infinite Loop. I've been trying to debug for 5 days, but with no success! The B-tree functions are recursive, which i personally hate...i think exactly the recursion makes the debug so difficult!

My code is relatively big, and it's been divided in 2 files, one source, one header. I'll just post here the functions of B-tree and the variables declarations. But, if someone wants to see full code, i'll post an "iFile.it" link (is it allowed? Sorry if not!)...

Thanks very 开发者_如何转开发much for attention and help...sorry for the big question!

Source link [not so important,just main()] : http://ifile.it/n73drmc/b-tree_file.c

Header link [all functions are here]: http://ifile.it/u1fa3kp/b-tree_file.h

Text File with strings: http://ifile.it/7hu95ot/arq_5.txt

NOTES about code:

i) The free() are with very strange behaviour...so i've commented them, because if i use them, my program bugs even more!

ii) All code comments are my ideas to try to solve the problems, and must be considered that i've tried them.

iii) I'm a native portuguese speaker, so functions and variables could have strange names for native english speakers...sorry for that !

THE CODE:

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>

 //defs of pre-compiler

 #ifdef __linux
    #define PAUSA "read p"
    #define CLRSCR "clear"
 #else
    #define PAUSA "Pause"
    #define CLRSCR "cls"
 #endif

 #define T 5 // minimum degree of B-tree
 #define Min (T-1)
 #define Max ((2*T)-1)
 //

 //global vars
 FILE *arqt, *arqb;

 char VAL[11];
 long int lt = 0;

 typedef struct {
    unsigned int num;
    long int pos;
    char chave[(Max+1)][11]; //chave in portuguese = key in english!
    short int folha; //folha in portuguese = leaf in english!
    long int c[(Max+2)];
 } Nod;

 Nod *raiz = NULL; //raiz in portuguese = root in english!
 //

 void b_split(Nod *x, unsigned int i, Nod *y) { //B-TREE-SPLIT-CHILD //that function split a B-tree node
Nod *z = NULL;
unsigned int j=1;

z = (Nod*)realloc(z,sizeof(Nod));

fseek(arqb,0,SEEK_END);
z->pos = ftell(arqb);

z->folha = y->folha;
z->num = Min;

for(j=1;j<=Min;j++) {strcpy(z->chave[j],y->chave[j+T]);}

if (y->folha == 0) {
    for(j=1;j<=T;j++) {z->c[j] = y->c[j+T];}

}

y->num = Min;

for(j=(x->num + 1);j<=(i+1);j--) {x->c[(j+1)] = x->c[j];}

x->c[(i+1)] = z->pos;

for(j=x->num;j<=i;j--) { strcpy(x->chave[j+1],x->chave[j]); }

strcpy(x->chave[i],y->chave[T]);

x->num = x->num + 1;

fseek(arqb,x->pos,SEEK_SET);
fwrite(x,sizeof(Nod),1,arqb);

fseek(arqb,y->pos,SEEK_SET);
fwrite(y,sizeof(Nod),1,arqb);

fseek(arqb,z->pos,SEEK_SET);
fwrite(z,sizeof(Nod),1,arqb);

//free(z);
//free(y);
}

void b_ins(Nod *x, char *val) { //B-TREE-INSERT-NONFULL //insert a key in nonfull node
unsigned int i=0;
Nod *C = NULL;

i = x->num;

if (x->folha == 1) {
    while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {
        strcpy(x->chave[(i+1)],x->chave[i]);
        i--;
    }
    strcpy(x->chave[(i+1)],val);
    x->num = x->num + 1;

    fseek(arqb,x->pos,SEEK_SET);
    fwrite(x,sizeof(Nod),1,arqb);

}

else {
    while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {i--;}
    i++;

    C = (Nod*)realloc(C,sizeof(Nod));

    fseek(arqb,x->c[i],SEEK_SET);
    fread(C,sizeof(Nod),1,arqb);

    if (C->num == Max) {
        b_split(x,i,C);

        if ( strcmp(val,x->chave[i]) > 0 ) {i++;}

    }
    fseek(arqb,x->c[i],SEEK_SET);
    fread(C,sizeof(Nod),1,arqb);

    b_ins(C,val);

    //free(C);
}

}

void insere_b(char *val) { //B-TREE-INSERT //i believe the problem is here!
Nod *S = NULL,*R = NULL;

R = (Nod*)realloc(R,sizeof(Nod));
R = raiz;

if (R->num == Max) {
    S = (Nod*)realloc(S,sizeof(Nod));

/*      fseek(arqb,0,SEEK_END);
        S->pos = ftell(arqb);
*/
    raiz = S;

    S->folha = 0;
    S->num = 0;
    S->c[1] = R->pos;

 /*      fseek(arqb,S->pos,SEEK_SET);
         fwrite(S,sizeof(Nod),1,arqb);

 */
    b_split(S,1,R);
    b_ins(S,val);

    //free(S);
}

else {b_ins(R,val);}

//free(R);
}

void busca_b(Nod *x, char *val) { //B-TREE-SEARCH //self explanatory
unsigned int i=1;
Nod *C = NULL;

while( (i <= x->num) && ( strcmp(val, x->chave[i]) > 0 ) ) {i++;}

if ( (i <= x->num) && ( strcmp(val, x->chave[i]) == 0 ) ) {printf ("\nValor encontrado!\n");}

else {
    if (x->folha == 1) {printf ("\nValor NAO encontrado!\n");}

    else {
        C = (Nod*)realloc(C,sizeof(Nod));
        fseek(arqb,x->c[i],SEEK_SET);
        fread(C,sizeof(Nod),1,arqb);

        lt++;

        busca_b(C,val);
        //free(C);
    }

}

}

void cria_b() { // cria arvore B //create the B-tree
long int i = 0;
char V[11];

raiz->folha = 1;
raiz->num = 0;
raiz->pos = 0;
for (i=1;i<=(Max+1);i++) {raiz->c[i] = -1;}

fseek(arqb,raiz->pos,SEEK_SET);
fwrite(raiz,sizeof(Nod),1,arqb);

rewind(arqt);
for (i=0;i<fSize(arqt);i++) {
    fscanf(arqt,"%s\n",V);
    insere_b(V);
}
}


The code is a little hard to read but here's what I see:

R = (Nod*)realloc(R,sizeof(Nod));
R = raiz;

You are doing a memory allocation and throwing away the result right away. The result of realloc may be different than the original pointer.

You probably want a function to initialize your tree nodes to make that clearer - it's hard for me to follow the code.

I am expecting to see pointers in each node in the tree but I'm not seeing any. I'm not following your implementation. I would suggest drawing your tree on paper with arrows showing what points to what and consider all corner cases (e.g. first insert when tree is empty) as those are usually what people can get stuck on. Also step through your code with a debugger and see if it behaves the way you expect it to.

EDIT: If the entire tree is built in a file (which staring at this a bit more seems to be the intention) you probably do not need to do any dynamic memory allocation at all.

Very generally you want to:

  • Watch out for overflows, e.g. you're writing to a memory location beyond what is available/has been allocated, the string functions are susceptible to those, make sure you always have a valid zero terminated string where required and you always have enough room for your string (that is it is not longer than what you can hold).
  • Check for NULL return values from the memory allocation functions.
  • Make sure you're not leaking memory, i.e. allocating and throwing away the pointer, never calling free().
  • Make sure you're always calling free with a pointer previously allocated and not using that pointer at any later point.
  • Seems there are many places where you're doing realloc() but really want malloc().

Some C/C++ compilers can do extra checks during run time (e.g. Visual Studio) which may be useful in pinpointing the problem.

Also look at Jonathan's good comments re conventions and style.


Header files should only contain declarations of structures, typedefs, enums, defines and function declarations. You might conceivably have inline function definitions in it, but probably not.

Your header file has full functions in it - ones that are unlikely to be inlined and are not declared inline. That means it is unusable for its intended purpose - to provide declarations to files using the code defined in the corresponding C file.

The code is not easy to read - and it isn't the language that is the problem. Constructs such as:

while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {i--;}

are not idiomatic C. The normal way of writing that would be:

while (i >= 1 && strcmp(val, x->chave[i]) < 0)
    i--;

There are coding guidelines that require braces around all loop bodies and conditionals; if you suffer from one of those, then you use one of a couple of coding conventions:

while (i >= 1 && strcmp(val, x->chave[i]) < 0)
{
    i--;
}

while (i >= 1 && strcmp(val, x->chave[i]) < 0) {
    i--;
}

I strongly prefer the former; there are lots of people who prefer the latter.

There are systems other than Windows and Linux in this world.

//defs of pre-compiler
#ifdef __linux
    #define PAUSA "read p"
    #define CLRSCR "clear"
#else
    #define PAUSA "Pause"
    #define CLRSCR "cls"
#endif

This doesn't work particularly sanely for my environment - MacOS X. I'm not exactly keen on programs that use system statements, but I guess that is a problem with me being an old fogey who doesn't use and IDE (though one of the reasons I don't use an IDE is because command line programs don't run sanely in them, and I mostly write command line programs, so they are hostile to my normal mode of working).


One major performance bug is:

for (i = 0; i < fSize(arqt); i++)

This calls the fSize() function on each iteration, and the function traverses the entire file, determining how many lines are in the file, restoring the read position before returning. It is not clear that you really need to count the number of lines - you can probably simply read the lines as you go. However, if you do need to count the lines, do so just once.

int lines = fSize(arqt);
for (i = 0; i < lines; i++)

There are a number of places where you have loops using:

for (x = 1; x <= MAX; x++)

This is usually incorrect in C; array indexes start at zero and the canonical form for a loop is:

for (x = 0; x < MAX; x++)

Stylistically, you normally reserve all upper-case names for #define or enum values and not as regular variables.


In the insere_b() function, you have:

//B-TREE-INSERT //i believe the problem is here!
void insere_b(char *val)
{
    Nod *S = NULL, *R = NULL;

    R = (Nod*)realloc(R, sizeof(Nod));
    //R = raiz;

    if (R->num == Max)
    {
        S = (Nod*)realloc(S, sizeof(Nod));

Someone else pointed out that the R = raiz; line is dubious. You assign null to R; then you realloc() it. This is always equivalent to malloc(). Further, the memory allocated is not initialized, so you get random values to play with. This leads to problems.

You then go through a similar sequence with S (allocating memory via realloc()), though this time you subsequently initialize parts (possibly all) of the structure allocated.

These are sufficient to cause trouble.


When the code goes into b_split() for the first time, you run into a problem with the position values. Specifically, y->pos is zero, but so is x->pos, so the program stores (writes) two nodes worth of data at the same offset, which is seldom a recipe for happiness. Since x is the root node (at least in this context - the first split), it should be at position 0; both y and z need to be at different positions. The y node also contains non-zeroed keys, though that doesn't matter much (except for purposes of tidiness) as the y->num value indicates they are not used.

You also do not ever use the chave[0] key value, AFAICT. That is a bit wasteful.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜