/* Fast sorting and merging for XAO::Indexer * * Andrew Maltsev, , 2004 */ #define PERL_NO_GET_CONTEXT #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #include #include #include #include /* If that version of perl does not have pTHX_ macros then defining them here */ #ifndef pTHX_ #define pTHX_ #endif #ifndef aTHX_ #define aTHX_ #endif /************************************************************************/ #define TREE_DEPTH (5) #define LEAF_SIZE (0x80000000>>(TREE_DEPTH*4-1)) #define MAX_ISECT 100 /************************************************************************/ union tree_node { union tree_node *nodes; U32 *data; }; static union tree_node tree_root[16]; /************************************************************************/ static void tree_print(union tree_node *branch, U8 level) { U8 spnum=(level+1)*2; U8 i; for(i=0; i<16; ++i) { if(!branch[i].nodes) continue; fprintf(stderr,"%*s:idx=%x level=%u\n",spnum,".",i,level); if(level> 28) & 0xf; union tree_node *node=branch+idx; //printf("pos=%lu value=%08lx level=%u idx=%u\n",pos,value,level,idx); if(levelnodes) { Newz(0,node->nodes,16,union tree_node); } tree_store(node->nodes,pos,value,level+1); } else { U32 *data=node->data; if(!node->data) { New(0,data,LEAF_SIZE,U32); memset(data,0xff,LEAF_SIZE*sizeof(U32)); node->data=data; } data[value&(LEAF_SIZE-1)]=pos; } } /* Only clears data blocks without even freeing their memory, since if * we're going to re-use the index -- it will most likely contain the same * ID's, only re-ordered. */ static void tree_clear(union tree_node *branch, U8 level) { U8 i; for(i=0; i<16; ++i) { if(level> 28) & 0xf; if(!node) { return 0xffffffff; } if(i==TREE_DEPTH-1) { U32 *data=node[idx].data; if(!data) return 0xffffffff; return data[value & (LEAF_SIZE-1)]; } else { node=node[idx].nodes; //printf("i=%u idx=%u node=%p\n",i,idx,node); } } return 0xffffffff; } static void tree_free(union tree_node *branch, U8 level) { U8 i; if(level==0) //printf("Freeing branch %p, level %u\n",branch,level); for(i=0; i<16; ++i) { //printf("Freeing level=%x, i=%x\n",level,i); if(levelpb ? 1 : (pa sizeof(cursors)/sizeof(*cursors)) { list_num=sizeof(cursors)/sizeof(*cursors); } // First sorting lists so that the shortest is first as obviously // intersection can't be longer then the shortest list. // // We need to sort two short arrays in place at the same time, so // doing it bubble-like way. // //// printset("Initial",list_num,lists,sizes); for(i8=0; i8jsz) { sz=jsz; pos=j; } } if(pos!=i8) { U32 *p=lists[pos]; lists[pos]=lists[i8]; lists[i8]=p; sizes[pos]=sizes[i8]; sizes[i8]=sz; } } //// printset("Sorted",list_num,lists,sizes); cursors[list_num-1]=0; // Now keeping positions in the lists and going through them // building resulting set in-place in the first position -- both for // size and data. // base_size=sizes[0]; res_size=0; for(i=0; i=list_size) { //printf("....no match!\n"); break; } } if(j>=list_num) { if(res_size!=i) { lists[0][res_size]=base_val; } ++res_size; //printf("....global match on %lu, res_size=%lu\n",base_val,res_size); } } sizes[0]=res_size; //// printset("Intersection",list_num,lists,sizes); } /************************************************************************/ static void printset_pos(char const *str, U8 list_num, U8 *wnums, U32 **lists, U32 *sizes) { U8 i; fprintf(stderr,"%s:\n",str); for(i=0; i MAX_ISECT) { list_num=MAX_ISECT; } // First sorting lists so that the shortest is first. For positions // lists the length only gives a rough estimate, but it's better then // nothing. // for(i8=0; i8jsz) { sz=jsz; pos=j; } } if(pos!=i8) { U32 *p=lists[pos]; lists[pos]=lists[i8]; lists[i8]=p; sizes[pos]=sizes[i8]; sizes[i8]=sz; j=wnums[pos]; wnums[pos]=wnums[i8]; wnums[i8]=j; } } cursors[list_num-1]=0; //printset_pos("Sorted",list_num,wnums,lists,sizes); // Now keeping positions in the lists and going through them // building resulting set in-place in the first position -- both for // size and data. // base_size=sizes[0]; base_list=lists[0]; res_size=0; for(i=0; i=list_size) { //printf("....no match!\n"); break; } } //// // If we made it through all lists then all cursors point at the // data in each list and there is that ID in every list. // Searching for words in sequence all in the same field. // if(j>=list_num) { U8 base_wnum=wnums[0]-1; //printf("MATCH: base_wnum=%u\n",base_wnum); for(++i; i=list_size || !fn) { //printf("..NOT FOUND: k=%lu, fn=%lu\n",k,fn); break; } // We end up here only if we found a match in this list // FOUND: //printf("..found: j=%u, k=%lu, list[k]=%lu\n",j,k,list[k]); } //// // If all other lists were scanned and matched, then // it's a match and we store it and advance cursors to the // next ID position. If we hit end on any cursors -- // that's it, no more data -- returning what we got. // if(j>=list_num) { //printf(".final found, base_val=%lu!\n",base_val); base_list[res_size++]=base_val; for(j=1; j=sizes[j]) { //printf("No point in looking beyond that\n"); goto FINAL; } //printf("Adjusting cursors[%u] from %lu(%lu) to %lu(%lu)\n",j,cursors[j],lists[j][cursors[j]],k,lists[j][k]); cursors[j]=k; } } } } while(iMAX_ISECT) { fprintf(stderr,"sorted_intersection_do - to many lists (>%u)",MAX_ISECT); avl=MAX_ISECT; } for(i=0; iMAX_ISECT) { fprintf(stderr,"sorted_intersection_pos - to many words (>%u)",MAX_ISECT); wnums_l=MAX_ISECT; } for(i=0; i