Home > external > kegg > constructMultiFasta.m

constructMultiFasta

PURPOSE ^

constructMultiFasta

SYNOPSIS ^

function constructMultiFasta(model,sourceFile,outputDir)

DESCRIPTION ^

 constructMultiFasta
   Saves one file in FASTA format for each reaction in the model that has genes

   Input:
   model         a model structure
   sourceFile    a file with sequences in FASTA format
   outputDir     the directory to save the resulting FASTA files in

   The source file is assumed to have the format '>gene identifier
   additional info'. Only the gene identifier is used for matching. This is
   to be compatible with the rest of the code that retrieves information
   from KEGG.

 Usage: constructMultiFasta(model,sourceFile,outputDir)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function constructMultiFasta(model,sourceFile,outputDir)
0002 % constructMultiFasta
0003 %   Saves one file in FASTA format for each reaction in the model that has genes
0004 %
0005 %   Input:
0006 %   model         a model structure
0007 %   sourceFile    a file with sequences in FASTA format
0008 %   outputDir     the directory to save the resulting FASTA files in
0009 %
0010 %   The source file is assumed to have the format '>gene identifier
0011 %   additional info'. Only the gene identifier is used for matching. This is
0012 %   to be compatible with the rest of the code that retrieves information
0013 %   from KEGG.
0014 %
0015 % Usage: constructMultiFasta(model,sourceFile,outputDir)
0016 
0017 sourceFile=char(sourceFile);
0018 outputDir=char(outputDir);
0019 if ~isfile(sourceFile)
0020     error('FASTA file %s cannot be found',string(sourceFile));
0021 end
0022 
0023 fprintf('Scanning the source multi-FASTA file... ');
0024 %Open the source file
0025 fid = fopen(sourceFile, 'r');
0026 
0027 %Since the list of genes will be accessed many times I use a Java hashtable
0028 hTable = java.util.Hashtable;
0029 
0030 for i=1:numel(model.genes)
0031     hTable.put(model.genes{i}, i);
0032 end
0033 
0034 %First go through the source file and save the position (in bytes) of the
0035 %start of each gene
0036 elementPositions=zeros(5000000,1,'int64'); %Make room for 5000000 elements
0037 totalElements=0;
0038 whereAmI=0; %Keeps track of where in the file we are
0039 while 1
0040     %Read 10 mb at a time
0041     str=fread(fid,10000000,'int8');
0042     
0043     %Find any '>' which indicates a new label in FASTA format
0044     newPosts=find(str==62); %62 is '>'
0045     
0046     elementPositions(totalElements+1:totalElements+numel(newPosts))=whereAmI+newPosts;
0047     
0048     totalElements=totalElements+numel(newPosts);
0049     
0050     whereAmI=whereAmI+10000000;
0051     
0052     if feof(fid)
0053         break;
0054     end
0055 end
0056 elementPositions=elementPositions(1:totalElements);
0057 fprintf('COMPLETE\n');
0058 
0059 fprintf(['NOTICE: If Matlab is freezing and does not provide any output in 30 minutes, consider increasing Java Heap Memory\n', ...
0060     'in MATLAB settings and start over with the new session\n']);
0061 fprintf('Mapping genes to the multi-FASTA source file...   0%% complete');
0062 %Now loop through the file to see which genes are present in the gene list
0063 %and save their position IN elementPositions! This is to enable a easy way
0064 %to get the distance to the following element
0065 genePositions=zeros(numel(model.genes),1);
0066 for i=1:numel(elementPositions)
0067     fseek(fid,elementPositions(i),-1);
0068     str=fread(fid,[1 30],'*char'); %Assumes that no ID is longer than 20 characters
0069     delim=find(str==32 | str==10,1,'first'); %Space or line feed
0070     
0071     geneIdentifier=str(1:delim-1);
0072     
0073     %This should never happen, but just to prevent errors. Could be that
0074     %'>' is a part of some gene information. An alternative would be to
0075     %check that the indexes follows a line feed
0076     if isempty(geneIdentifier)
0077         continue;
0078     end
0079     
0080     %If not found it means that the id was too long
0081     if isempty(delim)
0082         EM='Too long gene identifier, increase read length';
0083         dispEM(EM);
0084     end
0085     
0086     %See if the gene was found
0087     id=hTable.get(geneIdentifier);
0088     
0089     if any(id)
0090         if genePositions(id)==0
0091             genePositions(id)=i;
0092         end
0093     end
0094     %Print the progress
0095     if rem(i-1,350000) == 0
0096         progress=num2str(floor(100*i/numel(elementPositions)));
0097         progress=pad(progress,3,'left');
0098         fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress);
0099     end
0100 end
0101 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n');   
0102 
0103 fprintf('Generating the KEGG Orthology specific multi-FASTA files...   0%% complete');
0104 %Loop through the reactions and print the corresponding sequences
0105 for i=1:numel(model.rxns)
0106 
0107     %Do not overwrite existing files
0108     if ~exist(fullfile(outputDir,[model.rxns{i} '.fa']), 'file')
0109         
0110         %Get the positions in elementPositions for the involved genes
0111         genesUsed=model.rxnGeneMat(i,:);
0112         
0113         %Open a file for this reaction. This saves empty files for KOs
0114         %without genes
0115         rxnfid=fopen(fullfile(outputDir,[model.rxns{i} '.fa']),'w');
0116         
0117         if any(genesUsed)
0118             positions=genePositions(genesUsed~=0);
0119             
0120             %It could be that some genes were not found. Delete those
0121             %elements
0122             positions(positions==0)=[];
0123             
0124             %Print each sequence
0125             for j=1:numel(positions)
0126                 fseek(fid,elementPositions(positions(j)),-1);
0127                 %Should check that it ends with a gene!!!! Check for eof
0128                 if positions(j)<numel(elementPositions)
0129                     str=fread(fid,[1 double(elementPositions(positions(j)+1))-double(elementPositions(positions(j)))-1],'*char');
0130                     
0131                     %If the string does not end with a line feed character
0132                     if str(end)~=10
0133                         str=[str fread(fid,[1 double(elementPositions(positions(j)+2))-double(elementPositions(positions(j)+1))],'*char')];
0134                         
0135                         %This is if we still have not found a new gene.
0136                         %Maximal unluck. This whole check should be done
0137                         %when elementPositions are calculated!
0138                         if str(end)~=10
0139                             %Skip this gene
0140                             continue;
0141                         end
0142                     end
0143                 else
0144                     str=fread(fid,[1 inf],'*char');
0145                 end
0146                 fwrite(rxnfid,['>' str]);
0147             end
0148         end
0149         fclose(rxnfid);
0150     end
0151     %Print the progress
0152     if rem(i-1,50) == 0
0153         progress=num2str(floor(100*i/numel(model.rxns)));
0154         progress=pad(progress,3,'left');
0155         fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress);
0156     end
0157 end
0158 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n');
0159 
0160 %Close the source file
0161 fclose(fid);
0162 end

Generated by m2html © 2005