/* * File : correlations.c * Author : Gavin Sherlock * Date : September 29th 1999 * Version 1.0 * * This program reads a preclustering file, and produces a file that contains a sorted list of correlations. * The user can input a cutoff (as low as .5) and a number (as many as 50) */ #include "correlations.h" /* Main */ int main(int argc, char *argv[]){ char ifile[1024]; /* Path names must be 1024 chars or less */ int numExperiments=0; int numLines=0; float *dataMatrix; clusterRec cluster; float *eWeights; FILE *istream; char **experimentNames; /* get relevant input information */ if(argc==1 || argc==0){ printf ("Enter input file: "); gets (ifile); GetUserInput(); }else{ ParseOptions(ifile, argc, argv); } /* Read in the Data */ istream=OpenInFile(ifile); GetDataSize(istream, &numExperiments, &numLines); cluster.numGenes=numLines; /* put these in experiment to cut down on number of arguments that need passing */ DoMemoryAllocation(&eWeights, numExperiments, &experimentNames, &cluster, &dataMatrix); rewind(istream); ReadInData(istream, &cluster, numExperiments, eWeights, experimentNames, dataMatrix); fclose (istream); /* transform data if necessary */ if(gLogData) LogTransformData(dataMatrix, cluster.numGenes, numExperiments); MakeCorrelations(&cluster, dataMatrix, numExperiments, eWeights, ifile); free (dataMatrix); free (eWeights); FreeExperimentNames(experimentNames, numExperiments); FreeCluster(&cluster); printf("Finished\n"); fflush(stdout); return 0; } /* Function: OpenInFile * Usage: istream=OpenInFile(ifile) * ---------------------- * This functions attempts to open a file using the name * passed into it, and reports an error if it fails, otherwise * returns a file handle for reading. */ FILE* OpenInFile(char *ifile){ FILE *istream; if ((istream = fopen(ifile, "rb")) == NULL){ /* Open the file OK? */ Error("\nError opening input\n"); /* NO, say so */ } /* end if */ return (istream); } /* * Function : GetFilePrefix * Usage : prefix=GetFilePrefix(ifile) * ----------------------------------- * This function returns the prefix to a filename, which * is all characters up to the last '.' character, or if * one is not encountered, then all characters. If a unique * identifier was passed in, via the -u option, then the directory * path, plus the UID will be returned instead. */ char *GetFilePrefix(char *ifile){ char* prefix; int letter=0; int last=0; int seen=0; int dirLet=0; while (ifile[letter]!='\0'){ if (ifile[letter]=='.'){ last=letter; seen=1; }else if (ifile[letter]=='/'){ seen=0; if (gUID) dirLet=letter; /* remember the directory path */ } letter++; if (letter==1024) Error("The name of your input file is longer than 1024 characters"); } if (seen && !gUID){ /* saw a period */ prefix=malloc((last+2)*sizeof(char)); if (prefix==NULL) Error ("Not enough room for prefix\n"); strncpy(prefix, ifile, last+1); prefix[last+1]='\0'; }else if (!gUID){ /* no period */ prefix=malloc((letter+2)*sizeof(char)); if (prefix==NULL) Error ("Not enough room for prefix\n"); strcpy(prefix, ifile); prefix[letter]='.'; prefix[letter+1]='\0'; }else{ /* a UID was passed in */ prefix=malloc((dirLet+strlen(gPrefix)+3)*sizeof(char)); if (dirLet) { strncpy(prefix, ifile, dirLet+1); prefix[dirLet+1]='\0'; strcat(prefix, gPrefix); }else{ strcpy(prefix, gPrefix); } strcat(prefix, "."); prefix[dirLet+2+strlen(gPrefix)]='\0'; } return (prefix); } /* * Function: GetUserInput * Usage: GetUserInput() * --------------------------- * This function gets input form the user, as regards whether they want to cluster * experiments, and whether they want to use a centered metric for correlation. * It also checks whether they want to partition the data (and how), whether they * want to log transform the data, and whether they want to normalize of filter the data. */ void GetUserInput(){ GetTransformationOptions(); GetGeneMetric(); GetCutOff(); GetNumCorrelations(); } /* * Function: GetTransformationOptions * Usage: * ---------------------------------- * This function checks whether they want to log transform the data */ void GetTransformationOptions(void){ char inputLine[64]; printf("\nDo the data need to be log transformed?(yes/no)\n"); CheckYesOrNo(inputLine); if ((!strcmp(inputLine, "yes"))||(!strcmp(inputLine, "YES"))){ gLogData=1; } } /* * Function: GetGeneMetric * Usage: GetGeneMetric() * ------------------------ * This function checks whether they want to use a centered or * uncentered metric for the genes, if pearson was chosen as the * distance metric. */ void GetGeneMetric(void){ char inputLine[64]; printf("Do you want to use a correlation with a centered metric (yes/no)? \n"); CheckYesOrNo(inputLine); if ((!strcmp(inputLine, "yes"))||(!strcmp(inputLine, "YES"))){ gCentered=1; } } /* * Function: GetCutOff * Usage: GetCutoff() * ------------------- * This functions finds out what cutoff the user wants. If they enter less than 0.2 * it will use 0.2. The default is 0.8 */ void GetCutOff(void){ char inputLine[64]; printf("What do you want to use as a cutoff?\n"); printf("(Enter a value greater than 0.2, or hit return to use the default of 0.8)"); gets(inputLine); if (!strcmp(inputLine, "")){ /* they just hit return, so use default */ }else{ gCutOff=StringToReal(inputLine); if (gCutOff < 0.2){ printf("You entered a cutoff less than 0.2. .2 will be used as a cutoff\n"); gCutOff = 0.2; } if (gCutOff > 1) Error("You chose a cutoff greater than 1"); } } /* * Function: GetNumCorrelations * Usage : GetNumCorrelations(); * ----------------------------- * This function finds out the number of correlations that the person wants to store. * The default is 20, and they can have up to 50. */ void GetNumCorrelations(void){ char inputLine[64]; printf("How many correlations do you want?\n"); printf("(Enter a number, or hit return to get the default of 20)"); gets(inputLine); if (!strcmp(inputLine, "")){ /* they just hit return, so use default */ }else{ gMaxNumCorrelations=(int)StringToReal(inputLine); if (gMaxNumCorrelations < 1) Error("You chose a value less than 1"); } } /* * Function: GetYesOrNo * Usage: * -------------------- * This function simply asks for an input, and checks whether it is yes or no. * It stays continues prompting for a yes or no until one is received. */ void CheckYesOrNo(char *inputLine){ while(1){ gets(inputLine); if (!((!strcmp(inputLine, "yes"))||(!strcmp(inputLine, "YES"))||(!strcmp(inputLine, "no")) ||(!strcmp(inputLine, "NO")))){ printf("Please answer yes or no\n"); }else{ break; } } } /* * Function: ParseOptions * Usage: ParseOptions(ifile, argc, argv) * ---------------------------------------------- * This function parses supplied command line options. * Command line options are as follows: * -f filename * -g 1|2 ; 1 indicates non-centered metric, 2 indicates centered metric. 1 is default. * -e 0|1|2 ; 0 indicates no experiment clustering, see above for 1 and 2. 0 is the default. * The arguments can be passed in any order. */ void ParseOptions(char *ifile, int argc, char** argv){ int i; int foundFileName=0; int length; if (!(strcmp("-h", argv[1]))){ /* they want help */ Usage(); exit(0); /* NOTE EXITING PROGRAM HERE!!!! */ } if ((argc-1)%2)Error("Incorrect number of command line arguments"); for(i=1; i 1) Error("You chose a cutoff greater than 1"); }else if (!(strcmp("-num", argv[i]))){ /* number of correlations that they want */ gMaxNumCorrelations=(int)StringToReal(argv[i+1]); if (gMaxNumCorrelations < 1) Error("You chose a value less than 1"); }else if (!(strcmp("-showCorr", argv[i]))){ /* indicating whether they want the correlations */ gShowCorrelations = (int)StringToReal(argv[i+1]); if (gShowCorrelations > 1 || gShowCorrelations < 0){ Error("You have chosen an out of range value for defining whether to show correlations or not."); } }else{ Error("Unrecognized command line option. Only -f, -corr, -cutoff, -num, -l and -u are valid (currently!)"); } } if(!foundFileName)Error("You passed in command line arguments without specifying a filename"); } /* * Function: Usage * Usage : Usage() * --------------- * This functions prints out the command line arguments that may be used with the program, * and what they mean. */ void Usage(void){ printf("\n\nThe program \"correlations\" will take a preclustering file as an input, and produce a\n"); printf("file containing the correlations for each gene in sorted order.\n"); printf("The output file will be named with the same stem as the input file, but with a .stdCor suffix\n\n"); printf("Usage:\n"); printf("The following command line arguments may be used:\n\n"); printf("-f Allows you to specify the preclustering filename. Relative paths may be used\n\n"); printf("-corr 1|2 Allows you to specify whether you want an uncentered (1) or a centered (2) metric.\n"); printf(" 1 is the default\n\n"); printf("-cutoff Allows you to specify a cutoff, correlations above which will be stored\n\n"); printf("-num Allows you to specify the number of correlations that you would like to store\n"); printf(" 20 is the default\n\n"); printf("-l 0|1 Allows you to specify if you want to log transform the data (1)\n"); printf(" 0 is the default\n\n"); printf("-u Allows you to specify a unique id by which you output file will be named\n"); printf(" eg. correlations -f sample.pcl -u 888\n"); printf(" will produce an output file named 888.stdCor\n\n"); printf("-showCorr 0|1 specifies whether you want to see thhe correlations themselves.\n"); printf(" 1 is the default\n\n"); printf("Questions or comments should be addressed to sherlock@genome.stanford.edu\n\n"); } /* * Function: OpenOutFile * Usage: outfile=OpenOutFile("outfile.txt") * ----------------------------------------- * This functions attempts to open a file using the name * passed into it, and reports an error if it fails, otherwise * returns a file handle for writing. */ FILE* OpenOutFile(char *ofile){ FILE *ostream; if ((ostream = fopen(ofile, "w")) == NULL){ /* Open the file OK? */ Error("\nError opening output file: %s\n", ofile); /* NO, say so */ } /* end if */ return (ostream); } /* * Function: OpenForAppend * Usage: outfile=OpenForAppend("outfile.txt") * ----------------------------------------- * This functions attempts to open a file for appending using the name * passed into it, and reports an error if it fails, otherwise * returns a file handle for appending. */ FILE* OpenForAppend(char *ofile){ FILE *ostream; if ((ostream = fopen(ofile, "a")) == NULL){ /* Open the file OK? */ Error("\nError opening output file: %s\n", ofile); /* NO, say so */ } /* end if */ return (ostream); } /* Function: GetDataSize * Usage:GetDataSize(istream, &numExperiments, &numLines) * -------------------------------------------------- * This functions parses the data file, to determine the number * of genes and experiments represented therein. Because the * variables are passed in by address then there is no return argument. */ void GetDataSize(FILE *istream, int *numExperiments, int *numLines){ int nextbyte; int extraChar; printf("Getting size of data...\n"); fflush(stdout); while ((nextbyte = fgetc(istream))){ /* get characters from input file */ if (nextbyte=='\t'){ /* count tabs to get number of columns */ (*numExperiments)++; } /* the next bit looks for the end of line, whether it's PC ('\r\n'), * Mac ('\r') or UNIX ('\n'). It checks to see if the following character * has to do with the end of line also. If not, it puts it back in the stream. */ if ((nextbyte=='\r') || (nextbyte=='\n')){ /* look for end of line */ extraChar=fgetc(istream); if (extraChar!='\n' && extraChar!=EOF){ ungetc(extraChar, istream); } break; /* only reading first line to get number of columns */ } } (*numExperiments)-=2; /* the first two tabs didn't precede data so decrease the number */ while((nextbyte = fgetc(istream)) != EOF){ /* Read char.s until end of file */ switch (nextbyte) /* What char was read? */ { case '\r': /* on the Mac or the PC, \r will be the first thing seen to indicate an end of line */ (*numLines)++; extraChar=fgetc(istream); /* get rid of extra character if needs be */ if (extraChar!='\n' && extraChar!=EOF){ ungetc(extraChar, istream); } break; case '\n': /* on UNIX \n will be seen as the end of line */ (*numLines)++; break; default: break; } /* end switch */ } /* end while */ (*numLines)--; /* because there's an extra line in the file */ } /* * Function: DoMemoryAllocation * Usage: DoMemoryAllocation(&eWeights, numExperiments, &experimentNames, &cluster, &dataMatrix); * ------------------------------------------------------------------------------------------------------------------- * This function does the DMA that is needed for all the data structures that are required, * including the dataMatrix, the experiment names, and the nameRecs. */ void DoMemoryAllocation(float **eWeights, int numExperiments, char ***experimentNames, clusterRec *cluster, float **dataMatrix){ nameRec *namePtr; *eWeights=malloc(numExperiments*sizeof(float)); if (*eWeights==NULL) Error("Not enough memory available for eWeights"); *experimentNames=malloc(numExperiments*sizeof(char*)); if (*experimentNames==NULL) Error ("No memory available for experimentNames"); (*cluster).genes=malloc(cluster->numGenes * sizeof(nameRec)); /* allocate the memory for the name records */ if ((*cluster).genes==NULL) Error("No memory available for nameRecs"); /* now initialize the genes array */ for (namePtr=&(cluster->genes[0]); namePtr<&(cluster->genes[cluster->numGenes]); namePtr++){ InitializeArray(namePtr); } /* allocate dataMatrix */ *dataMatrix=malloc(numExperiments*(cluster->numGenes)*sizeof(float)); if (*dataMatrix==NULL) Error("No memory available for dataMatrix"); } /* * Function: ReadInData * Usage: ReadInData(istream, &cluster, numExperiments, eWeights, numLines, &experimentNames, dataMatrix) * ------------------------------------------------------------- * This functions parses the data file, to retrieve the log transformed data. * it also retrieves the eWeights, that form the second line of the file. */ float *ReadInData(FILE *istream, clusterRec *cluster, int numExperiments, float *eWeights, char **experimentNames, float *dataMatrix){ char buffer[1024]; char nextbyte; char extraChar; int currCol=0; int letter=0; int dataCol=0; int currLine=0; printf("Reading Data...\n"); fflush(stdout); /* parse the first line for the experiment names * should really consolidate code for parsing first and second lines * by switching again, dependent on the lineNumber, 1 or 2 */ while ((nextbyte=fgetc(istream))){ if (nextbyte=='\t'){ buffer[letter]='\0'; switch (++currCol){ /* depends what column we're looking at as to what we'll do */ case 1: case 2: case 3: letter=0; break; default: /* other columns must be data columns */ if (letter!=0){ experimentNames[dataCol]=malloc((letter+1)*sizeof(char)); }else{ Error("An experiment is unnamed at data column %d\n", dataCol+1); } if (experimentNames[dataCol]==NULL) Error("Not enough memory for experiment name %d", dataCol+1); strcpy(experimentNames[dataCol], buffer); letter=0; dataCol++; break; } /* end switch */ }else if (nextbyte=='\r' || nextbyte=='\n'){ buffer[letter]='\0'; if (letter!=0){ experimentNames[dataCol]=malloc((letter+1)*sizeof(char)); }else{ Error("An experiment is unnamed at data column %d\n", dataCol+1); } if (experimentNames[dataCol]==NULL) Error("Not enough memory for experiment name %d", dataCol+1); strcpy(experimentNames[dataCol], buffer); extraChar=fgetc(istream); /* get rid of extra character if needs be */ if (extraChar!='\n' && extraChar!=EOF){ ungetc(extraChar, istream); } break; }else{ if (letter>=1024) Error("An experiment name longer than 1024 bytes exists in the file on the first line."); buffer[letter++]=nextbyte; } } letter=0; dataCol=0; currCol=0; /* Now read in the eWeights line */ while ((nextbyte=fgetc(istream))){ if (nextbyte=='\t'){ buffer[letter]='\0'; switch (++currCol){ case 1: case 2: case 3: letter=0; break; default: /* other columns must be data columns */ eWeights[dataCol]=StringToReal(buffer); letter=0; dataCol++; break; } /* end switch */ }else if (nextbyte=='\r' || nextbyte=='\n'){ buffer[letter]='\0'; eWeights[dataCol]=StringToReal(buffer); extraChar=fgetc(istream); /* get rid of extra character if needs be */ if (extraChar!='\n' && extraChar!=EOF){ ungetc(extraChar, istream); } break; }else{ if (letter>=1024) Error("A token longer than 1024 bytes exists in the file on the second line."); buffer[letter++]=nextbyte; } } for (currLine=0; currLinenumGenes; currLine++){ ReadOneLine(istream, dataMatrix, numExperiments, currLine, &(cluster->genes[currLine])); } printf("Done reading data...\n"); fflush(stdout); return (dataMatrix); } /* * Function: InitializeArray * Usage: InitializeArray(&(names[count])); * ---------------------------------------- * This simply initializes the fields of each nameRec to be zero or NULL */ void InitializeArray(nameRec *names){ names->orf=NULL; names->name=NULL; names->joined=0; names->numCorrelations=0; names->last=NULL; names->first=NULL; } /* Function: ReadOneLine * Usage: ReadOneLine(istream, dataMatrix, &numDataPoints, numExperiments, currLine, &(names[currLine])) * --------------------------------------------------- * This function is called from ReadInData, and will parse one * line of data, depositing the data into dataMatrix, the space for * which was allocated in the ReadInData function. */ void ReadOneLine(FILE *istream, float *dataMatrix, int numExperiments, int currLine, nameRec *names){ char buffer[1024]; int currCol=0; int dataCol=0; int tabCount=0; /* if two tabs next to each other, then a null value exists */ char nextbyte; char extraChar; int letter=0; /* initialize gene records */ while ((nextbyte=fgetc(istream))!=EOF){ switch (nextbyte){ case '\t': tabCount++; if (dataCol>=numExperiments) Error("There are more columns of data than expected on data line %d. Only %d columns were expected.\n", currLine+1, numExperiments); if (tabCount==2){ /* a null data point exists, or there's a missing name */ if (currCol>2){ dataMatrix[currLine*numExperiments+dataCol]=NODATA; currCol++; dataCol++; tabCount=1; letter=0; break; }else{ tabCount=0; } } buffer[letter]='\0'; switch (currCol++){ case 0: /* First column is the ORF name */ if (letter>0){ names->orf=malloc((letter+1)*sizeof(char)); if (names->orf==NULL) Error("Not Enough Memory Available for orfs\n"); strcpy(names->orf, buffer); }else{ /*printf("Missing Name\n");*/ } letter=0; break; case 1: /* Second column is the SGD name */ if (letter>0){ names->name=malloc((letter+1)*sizeof(char)); if (names->name==NULL) Error("Not Enough Memory Available for names\n"); strcpy(names->name, buffer); }else{ /*printf("Missing Name\n");*/ } letter=0; break; case 2: /* Third column is the row weight */ names->rowWeight=(float)(StringToReal(buffer)); letter=0; break; default: /* other columns must be data columns */ dataMatrix[currLine*numExperiments+dataCol]=(float)(StringToReal(buffer)); letter=0; dataCol++; break; } break; case '\r': case '\n': extraChar=fgetc(istream); /* get rid of extra character if needs be */ if (extraChar!='\n' && extraChar!=EOF){ ungetc(extraChar, istream); } tabCount++; if (dataCol>=numExperiments) Error("There are more columns of data than expected on data line %d. Only %d columns were expected, but %d were found.\n", currLine+1, numExperiments, dataCol+1); if(dataCol=1024) Error("A token longer than 1024 bytes exists in the file on line %d", currLine); buffer[letter++]=nextbyte; tabCount=0; break; } } } /* * Function: StringToReal * Usage: s=StringToReal(buffer) * --------------------------- * This function converts a string representing a real number * into its corresponding value. If the string is not a legal * floating point number, or it contains extraneous characters, * StringToReal signals an Error condition. This code was taken * from Eric Roberts' book "The Art and Science of C". */ double StringToReal(char *s){ double result; char dummy; if (s==NULL) Error("NULL string passed to StringToReal"); if (sscanf(s, " %lg %c", &result, &dummy) !=1){ Error("StringToReal called on illegal number %s", s); } return (result); } /* * Function: MakeCorrelations * Usage: MakeCorrelations(&cluster, dataMatrix, numExperiments, eWeights, numGenes, 'G') * MakeCorrelations(&cluster, dataMatrix, numExperiments, eWeights, numExperiments, 'E') * -------------------------------------------------- * This function generates the correlation coefficients * by comparing each profile to every other profile. * It cuts the work in half by not comparing A vs B, and * B vs A, which should be identical. It will save up to gMaxNumCorrelations * of the top correlations. This allows correlation scores to * be freed once the node has been joined. The scores must be in * sorted order. There are pointers to both the top and bottom scores, * so that it can rapidly be decided whether a correlation is the highest, * or whether a newly calculated correlation should be included. * By passing in different "total", and "type", the function can be made to * calculate correlations for experiments, or genes. */ void MakeCorrelations(clusterRec *cluster, float *dataMatrix, int numExperiments, float *eWeights, char *ifile){ int counter; int comparedToCounter; float pearsonCorrelation; FILE *outfile; int numAfterWhichToPrint=400000/cluster->numGenes; char *filename; printf("Making correlations\n"); fflush(stdout); MakeFileName(ifile, &filename); outfile = OpenOutFile(filename); for (counter=0; counternumGenes-1; counter++){ for (comparedToCounter=counter+1; comparedToCounternumGenes; comparedToCounter++){ pearsonCorrelation=CalculateCorrelation(&dataMatrix[counter*numExperiments], &dataMatrix[comparedToCounter*numExperiments], numExperiments, eWeights); /* Once we have the correlation coefficient we have to check * if it's greater than the last value in the linked list * for BOTH the geneCounter gene, and the comparedToCounter * gene. This way every linked list will contain its top * five most similar correlations. If you only checked the * geneCounter gene, then it is possible (it does happen, I * checked) that you would not be storing the top correlations * for some genes anywhere. */ CheckToInsert(cluster, counter, comparedToCounter, pearsonCorrelation); CheckToInsert(cluster, comparedToCounter, counter, pearsonCorrelation); } /* now print out relevant correlations for gene, then free those correlations */ fprintf(outfile, "%s", cluster->genes[counter].orf); PrintOneGene(cluster->genes[counter].first, outfile, cluster); FreeCorrelations(&(cluster->genes[counter].first)); if (!(counter%numAfterWhichToPrint)){ printf ("%d\n", counter); fflush(stdout); } } /* print correlations for final gene */ fprintf(outfile, "%s", cluster->genes[counter].orf); PrintOneGene(cluster->genes[counter].first, outfile, cluster); FreeCorrelations(&(cluster->genes[counter].first)); printf("Done Making Correlations\n"); fflush(stdout); fclose(outfile); free(filename); } /* * Function: CalculateCorrelation * Usage: pearsonCorrelation=CalculateCorrelation(geneCounter, comparedToCounter, dataMatrix, numExperiments, eWeights, type, cluster->numGenes) * ------------------------------------------------------------------------------------------------------------------------ * This function calculates the correlation between two expression profiles, whose identities are * indicated by geneCounter and comparedToCounter. The algorithm calculates the pearson correlation * and only utilizes data where the two profiles have data (experiments) in common. It is able * to keep running totals, so that it does not have to go over the datasets once again after it * has found which experiments are in common between two profiles. This code is taken almost verbatim * from Mike Eisen's original correlation code. One bug in that code has been fixed (the original * code wrongly used numExperiments, instead of Count, in the final calculation) and the data access looks * somewhat different to reflect the nature of the way that I have stored the data in what was a * dynamically allocated array. I have used pointers all the time to access the array data. It looks * ugly and confusing, as I'm updating the pointers themselves, rather than a separate variable. It is, * however, far quicker (~35%!). */ float CalculateCorrelation(float *genePtr, float *cmpPtr, int numExperiments, float *colPtr){ float Sum1, Sum2, Sum11, Sum22, Sum12, Ave1, Ave2, Count, Corr, norm; register float colWeight; register float geneVal; register float cmpVal; float *max=genePtr+numExperiments; Sum1 = Sum2 = Sum11 = Sum22 = Sum12 = Count = Ave1 = Ave2 = 0; Corr=-1.0; for(; genePtr0){ Corr = (Sum12 - Sum1 * Ave2 - Sum2 * Ave1 + Count * Ave1 * Ave2) /norm; } }else{ norm=sqrt(Sum11*Sum22); if (norm>0){ Corr=(Sum12/norm); } } }else{ Corr=0; } return (Corr); } /* * Function: CheckToInsert * Usage: CheckToInsert(cluster, geneCounter, comparedToCounter, pearsonCorrelation); * ------------------------------------------------------------------------------------- * This function checks whether a correlation that was just made is * worthy of insertion into a list of correlations, based on it's value. * if so it makes a new record, which contains a number that indicates * the gene to which the geneCounter gene is correlated to, then calls * a function to insert it. It will also check whether there are enough * correlations already in the list, and delete the lowest one if appropriate. */ void CheckToInsert(clusterRec *cluster, int geneCounter, int comparedToCounter, float pearsonCorrelation){ correlationRec *newOne; if (pearsonCorrelation > gCutOff && ((cluster->genes[geneCounter].last==NULL)|| (pearsonCorrelation > cluster->genes[geneCounter].last->corr)|| (cluster->genes[geneCounter].numCorrelationsgenes[geneCounter].numCorrelations==gMaxNumCorrelations){ cluster->genes[geneCounter].last->corr=pearsonCorrelation; cluster->genes[geneCounter].last->ORFnumber=comparedToCounter; cluster->genes[geneCounter].last=SwitchLast(&(cluster->genes[geneCounter].first), cluster->genes[geneCounter].last); }else{ newOne=MakeNewRecord(pearsonCorrelation, comparedToCounter); InsertSorted(&(cluster->genes[geneCounter].first), newOne); if (newOne->next==NULL){ cluster->genes[geneCounter].last=newOne; } ++cluster->genes[geneCounter].numCorrelations; } } } /* * Function: SwitchLast */ correlationRec *SwitchLast(correlationRec **list, correlationRec* newOne){ correlationRec *curr, *prev; prev=NULL; for (curr=*list; curr->next!=NULL; curr=curr->next){ if (curr->corrcorr) break; /* We passed it, so now want to insert here */ prev=curr; } /* now curr points to the entry to delete (we already actually knew this) * and prev points to the entry that will now be the end of the list */ newOne->next=curr; if (prev!=NULL){ prev->next=newOne; }else{ *list=newOne; } for(;curr->next!=newOne;curr=curr->next){} curr->next=NULL; return curr; } /* * Function: MakeNewRecord * Usage: MakeNewRecord(pearsonCorrelation, comparedToCounter) * ----------------------------------------------------------- * This function allocates space for a new correlationRec * and initializes its fields */ correlationRec *MakeNewRecord(double correlation, int geneNumber){ correlationRec *newOne; newOne=malloc(sizeof(correlationRec)); if (newOne==NULL) Error("Not enough memory available for a new correlationRec"); newOne->corr=correlation; newOne->ORFnumber=geneNumber; newOne->next=NULL; return (newOne); } /* * Function: InsertSorted * Usage: InsertSorted(&(cluster->genes[geneCounter]), cp) * ---------------------------------------------------------- * This function will insert a correlationRec in order (with * respect to the value of the correlation it contains). It * handles the special case of their being no previous correlationRecs * in the list. This is why the list has to be passed in as a double * pointer. */ void InsertSorted(correlationRec **list, correlationRec *newOne){ correlationRec *prev, *curr; prev=NULL; for (curr=*list; curr!=NULL; curr=curr->next){ if (curr->corrcorr) break; /* We passed it, so now want to insert here */ prev=curr; } /* now, "prev" points to the one before where the newOne will be inserted, next "curr" points to the one after */ newOne->next=curr; if (prev !=NULL){ prev->next=newOne; }else{ *list = newOne; } } /* * Function: DeleteLast * Usage: cluster->genes[geneCounter].last=DeleteLast(cluster->genes[geneCounter].first) * ------------------------------------------------------- * This function is invoked when the last correlation in a list is to a gene * that has just been joined into a node. Hence the correlationRec associated * with that correlation is deleted, and the one in the list previous to it * is returned as now being at the end of the list. */ correlationRec *DeleteLast(correlationRec **list){ correlationRec *curr, *prev; prev=NULL; for (curr=*list; curr->next!=NULL; curr=curr->next){ prev=curr; } /* now curr points to the entry to delete (we already actually knew this) * and prev points to the entry that will now be the end of the list */ prev->next=NULL; free (curr); return prev; } /* * Function: FreeCluster * Usage: FreeCluster(cluster) * --------------------------------- * This function frees the memory associated with a particular dataset, * including all ORF and genes names. */ void FreeCluster(clusterRec *cluster){ int counter; for (counter=0; counternumGenes; counter++){ if (cluster->genes[counter].orf!=NULL) free(cluster->genes[counter].orf); if (cluster->genes[counter].name!=NULL) free(cluster->genes[counter].name); } free (cluster->genes); } /* * Function: FreeCorrelations * Usage: FreeCorrelations(&(cluster->genes[node1].first)) * ---------------------------------------------------------- * This function walks through a linked list freeing the memory * associated with each stored correlation. */ void FreeCorrelations(correlationRec **node){ correlationRec *curr, *next; for (curr=*node; curr!=NULL;){ next=curr->next; free(curr); curr=next; } } /* * Function: MakeFile * Usage: MakeFileNames(ifile, &fileName) * ----------------------------------------------------------------------- * This function simply determines from the name of the input file, what the * output files should be called. */ void MakeFileName(char *ifile, char **fileName){ char* prefix; prefix=GetFilePrefix(ifile); *fileName=malloc((strlen(prefix)+7)*sizeof(char)); if(*fileName==NULL)Error("Not enough memory for filenames"); strcpy(*fileName, prefix); strcpy(*fileName+strlen(prefix), "stdCor\0"); free(prefix); } /* * Function: Error * Usage: Error(msg, ...) * ---------------------- * This function generates an error string, expanding % constructions * appearing in the error message string just as printf does. * After printing the error message, the program terminates, * This code was taken from Eric Roberts' "The Art and Science of C". */ void Error(char *msg, ...){ va_list args; va_start(args, msg); fprintf(stderr, "Error: "); vfprintf(stderr, msg, args); fprintf(stderr, "\n"); va_end(args); exit(1); } /* Function: FreeExperimentNames * Usage: FreeExperimentNames(experimentNames, numExperiments) * -------------------------------------------------------------- * This function frees the storage associated with keeping track of the experiment names */ void FreeExperimentNames(char **experimentNames, int numExperiments){ int i; for (i=0; igenes[counter].first, outfile) * -------------------------------------------------------------- * This function will print out the correlations to a particular gene, * The output is tab delimited, and consists of both the gene name, * and the correlation value itself. */ void PrintOneGene(correlationRec *list, FILE *outfile, clusterRec *cluster){ correlationRec *curr; for (curr=list; curr!=NULL; curr=curr->next){ fprintf(outfile, "\t%s", cluster->genes[(curr->ORFnumber)].orf); if (gShowCorrelations){ fprintf(outfile, "\t%f", curr->corr); } } fprintf(outfile, "\n"); } /* * Function : LogTransformData * Usage : LogTransformData(dataMatrix, numGenes, numExperiments) * -------------------------------------------------------------- * This function transforms all the data in the dataMatrix into * into log base2 */ void LogTransformData(float *dataMatrix, int numGenes, int numExperiments){ int gene; int experiment; float log2=log(2); printf("Log Transforming Data\n"); for (experiment=0; experiment