Home > external > metacyc > getMetaCycModelForOrganism.m

getMetaCycModelForOrganism

PURPOSE ^

getMetaCycModelForOrganism

SYNOPSIS ^

function [model, metacycBlastStruct] = getMetaCycModelForOrganism(organismID,fastaFile,keepTransportRxns,keepUnbalanced,keepUndetermined,minScore,minPositives,useDiamond,metacycBlastStruct)

DESCRIPTION ^

 getMetaCycModelForOrganism
   Reconstructs a genome-scale metabolic model based on protein homology to the
   MetaCyc pathway database

   Input:
   organismID          the query organism's abbreviation, which is defined
                       by user
   fastaFile           a FASTA file that contains the protein sequences of
                       the organism for which to reconstruct a model
   keepTransportRxns   include transportation reactions, which often have identical
                       reactants and products that turn to be all-zero columns in
                       the S matrix (opt, default false)
   keepUnbalanced      include reactions cannot be unbalanced reactions, usually
                       because they are polymeric reactions or because of a
                       specific difficulty in balancing class structures
                       (opt, default false)
   keepUndetermined    include reactions that have substrates lack chemical
                       structures or with non-numerical coefficients (e.g. n+1)
                       (opt, default false)
   minScore            minimum Bit scores of BLASTp search (opt, default 100)
   minPositives        minimum Positives values of BLASTp search (opt, default 45 %)
   useDiamond          use DIAMOND alignment tools to perform homology search
                       if true, otherwise the BLASTP is used (opt, default true)
   metacycBlastStruct  provided an earlier generated metacycBlastStruct
                       from getMetaCycModelForOrganism (opt, default empty).
                       Useful when trying different cutoffs for minScore
                       and minPositives without having to regenerate the
                       blastStructure each time.

   Output:
   model               a model structure for the query organism
   metacycBlastStruct  result from getBlast or getDiamond, before the
                       minScore and minPositives cutoffs are applied

   Usage: [model, metacycBlastStruct] = getMetaCycModelForOrganism(organismID,fastaFile,...
    keepTransportRxns,keepUnbalanced,keepUndetermined,minScore,minPositives,useDiamond,metacycBlastStruct)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [model, metacycBlastStruct] = getMetaCycModelForOrganism(organismID,fastaFile,...
0002     keepTransportRxns,keepUnbalanced,keepUndetermined,minScore,minPositives,useDiamond,metacycBlastStruct)
0003 % getMetaCycModelForOrganism
0004 %   Reconstructs a genome-scale metabolic model based on protein homology to the
0005 %   MetaCyc pathway database
0006 %
0007 %   Input:
0008 %   organismID          the query organism's abbreviation, which is defined
0009 %                       by user
0010 %   fastaFile           a FASTA file that contains the protein sequences of
0011 %                       the organism for which to reconstruct a model
0012 %   keepTransportRxns   include transportation reactions, which often have identical
0013 %                       reactants and products that turn to be all-zero columns in
0014 %                       the S matrix (opt, default false)
0015 %   keepUnbalanced      include reactions cannot be unbalanced reactions, usually
0016 %                       because they are polymeric reactions or because of a
0017 %                       specific difficulty in balancing class structures
0018 %                       (opt, default false)
0019 %   keepUndetermined    include reactions that have substrates lack chemical
0020 %                       structures or with non-numerical coefficients (e.g. n+1)
0021 %                       (opt, default false)
0022 %   minScore            minimum Bit scores of BLASTp search (opt, default 100)
0023 %   minPositives        minimum Positives values of BLASTp search (opt, default 45 %)
0024 %   useDiamond          use DIAMOND alignment tools to perform homology search
0025 %                       if true, otherwise the BLASTP is used (opt, default true)
0026 %   metacycBlastStruct  provided an earlier generated metacycBlastStruct
0027 %                       from getMetaCycModelForOrganism (opt, default empty).
0028 %                       Useful when trying different cutoffs for minScore
0029 %                       and minPositives without having to regenerate the
0030 %                       blastStructure each time.
0031 %
0032 %   Output:
0033 %   model               a model structure for the query organism
0034 %   metacycBlastStruct  result from getBlast or getDiamond, before the
0035 %                       minScore and minPositives cutoffs are applied
0036 %
0037 %   Usage: [model, metacycBlastStruct] = getMetaCycModelForOrganism(organismID,fastaFile,...
0038 %    keepTransportRxns,keepUnbalanced,keepUndetermined,minScore,minPositives,useDiamond,metacycBlastStruct)
0039 
0040 organismID=char(organismID);
0041 if nargin<2
0042     EM='No query protein fasta file is specified';
0043     dispEM(EM);
0044 else
0045     fastaFile=char(fastaFile);
0046 end
0047 if nargin<3
0048     keepTransportRxns=false;
0049 end
0050 if nargin<4
0051     keepUnbalanced=false;
0052 end
0053 if nargin<5
0054     keepUndetermined=false;
0055 end
0056 if nargin<6
0057     minScore=100;
0058 end
0059 if nargin<7
0060     minPositives=45;
0061 end
0062 if nargin<8
0063     useDiamond=true;
0064 end
0065 if nargin<9
0066     metacycBlastStruct=[];
0067 end
0068 
0069 if isempty(fastaFile)
0070     error('*** The query FASTA filename cannot be empty! ***');
0071 else
0072     fprintf('\nChecking existence of query FASTA file... ');
0073     %Check if query fasta exists
0074     fastaFile=checkFileExistence(fastaFile,2); %Copy file to temp dir
0075     fprintf('done\n');
0076 end
0077 
0078 %First generate the full MetaCyc model
0079 metaCycModel=getModelFromMetaCyc([],keepTransportRxns,keepUnbalanced,keepUndetermined);
0080 fprintf('The full MetaCyc model loaded\n');
0081 
0082 %Create the draft model from MetaCyc super model model=metaCycModel;
0083 model.id=organismID;
0084 model.name='Generated by homology with MetaCyc database';
0085 model.rxns=metaCycModel.rxns;
0086 model.rxnNames=metaCycModel.rxnNames;
0087 model.eccodes=metaCycModel.eccodes;
0088 model.subSystems=metaCycModel.subSystems;
0089 model.rxnMiriams=metaCycModel.rxnMiriams;
0090 model.rxnReferences=metaCycModel.rxnReferences;
0091 model.lb=metaCycModel.lb;
0092 model.ub=metaCycModel.ub;
0093 model.rev=metaCycModel.rev;
0094 model.c=metaCycModel.c;
0095 model.equations=metaCycModel.equations;
0096 
0097 %Get the root directory for RAVEN Toolbox.
0098 ravenPath=findRAVENroot();
0099 
0100 %Generate blast strcture by either DIAMOND or BLASTP
0101 if isempty(metacycBlastStruct)
0102     if useDiamond
0103         blastStruc=getDiamond(organismID,fastaFile,{'MetaCyc'},fullfile(ravenPath,'external','metacyc','protseq.fsa'));
0104     else
0105         blastStruc=getBlast(organismID,fastaFile,{'MetaCyc'},fullfile(ravenPath,'external','metacyc','protseq.fsa'));
0106     end
0107     %Only look the query
0108     blastStructure=blastStruc(2);
0109     metacycBlastStruct=blastStructure;
0110 else
0111     blastStructure=metacycBlastStruct;
0112 end
0113 
0114 %Remove all blast hits that are below the cutoffs
0115 indexes=blastStructure.bitscore>=minScore & blastStructure.ppos>=minPositives;
0116 blastStructure.toGenes(~indexes)=[];
0117 blastStructure.fromGenes(~indexes)=[];
0118 blastStructure.evalue(~indexes)=[];
0119 blastStructure.aligLen(~indexes)=[];
0120 blastStructure.identity(~indexes)=[];
0121 blastStructure.bitscore(~indexes)=[];
0122 blastStructure.ppos(~indexes)=[];
0123 fprintf('Completed searching against MetaCyc protein sequences.\n');
0124 
0125 % Get the qualified genes of query organism from blast structure
0126 model.genes=cell(100000,1);
0127 model.proteins=cell(100000,1);
0128 model.bitscore=zeros(100000,1);
0129 model.ppos=zeros(100000,1);
0130 num=1;
0131 
0132 %Go through the strucutre and find out the hit with the best bit score
0133 for i=1:numel(blastStructure.fromGenes)
0134     if num==1
0135         model.genes(num)=blastStructure.fromGenes(i);
0136         model.proteins(num)=strrep(blastStructure.toGenes(i), 'gnl|META|', '');
0137         model.bitscore(num)=blastStructure.bitscore(i);
0138         model.ppos(num)=blastStructure.ppos(i);
0139         num=num+1;
0140         lastGene=blastStructure.fromGenes(i);
0141     else
0142         if ~isequal(lastGene,blastStructure.fromGenes(i))
0143             model.genes(num)=blastStructure.fromGenes(i);
0144             model.proteins(num)=strrep(blastStructure.toGenes(i), 'gnl|META|', '');
0145             model.bitscore(num)=blastStructure.bitscore(i);
0146             model.ppos(num)=blastStructure.ppos(i);
0147             num=num+1;
0148             lastGene=blastStructure.fromGenes(i);
0149         else
0150             if model.bitscore(num)<blastStructure.bitscore(i)
0151                 model.bitscore(num)=blastStructure.bitscore(i);
0152                 model.proteins(num)=strrep(blastStructure.toGenes(i), 'gnl|META|', '');
0153                 model.ppos(num)=blastStructure.ppos(i);
0154             end
0155         end
0156     end
0157 end
0158 model.genes=model.genes(1:num-1);
0159 model.proteins=model.proteins(1:num-1);
0160 model.bitscore=model.bitscore(1:num-1);
0161 model.ppos=model.ppos(1:num-1);
0162 
0163 % Get the indexes of matched genes in the metaCycModel
0164 % because some enzymes in the FASTA file cannot be found in the dump file
0165 [hits, indexes]=ismember(model.proteins,metaCycModel.genes);
0166 found = find(hits);
0167 model.genes=model.genes(found);
0168 
0169 % Restructure the rxnGeneMat matrix
0170 model.rxnGeneMat=sparse(metaCycModel.rxnGeneMat(:,indexes(found)));
0171 
0172 %Remove all reactions without genes
0173 hasGenes=any(model.rxnGeneMat,2);
0174 model=removeReactions(model,~hasGenes,true);
0175 
0176 %Generate grRules, only consider the or relationship here Matched enzymes
0177 %are stored in field rxnScores,
0178 rxnNum=numel(model.rxns);
0179 model.rxnConfidenceScores=NaN(rxnNum,1);
0180 model.rxnConfidenceScores(:)=2;
0181 model.grRules=cell(rxnNum,1);
0182 %model.rxnScores=cell(rxnNum,1);
0183 for j=1:rxnNum
0184     model.grRules{j}='';
0185     I=find(model.rxnGeneMat(j,:));
0186     for k=1:numel(I)
0187         if isempty(model.grRules{j})
0188             model.grRules(j)=model.genes(I(k));
0189             %model.rxnScores(j)=model.proteins(I(k));
0190         else
0191             model.grRules(j)=strcat(model.grRules(j),{' or '},model.genes(I(k)));
0192             %model.rxnScores(j)=strcat(model.rxnScores(j),{' or
0193             %'},model.proteins(I(k)));
0194         end
0195     end
0196 end
0197 %update genes field
0198 model.genes=model.genes(any(model.rxnGeneMat));
0199 
0200 %Construct the S matrix and list of metabolites
0201 [S, mets, badRxns]=constructS(model.equations);
0202 model.S=S;
0203 model.mets=mets;
0204 
0205 %model=removeReactions(model,badRxns,true,true);
0206 
0207 %Then match up metabolites
0208 metaCycMets=getMetsFromMetaCyc([]);
0209 
0210 %Add information about all metabolites to the model
0211 [a, b]=ismember(model.mets,metaCycMets.mets);
0212 a=find(a);
0213 b=b(a);
0214 
0215 if ~isfield(model,'metNames')
0216     model.metNames=cell(numel(model.mets),1);
0217     model.metNames(:)={''};
0218 end
0219 model.metNames(a)=metaCycMets.metNames(b);
0220 
0221 if ~isfield(model,'metFormulas')
0222     model.metFormulas=cell(numel(model.mets),1);
0223     model.metFormulas(:)={''};
0224 end
0225 model.metFormulas(a)=metaCycMets.metFormulas(b);
0226 
0227 if ~isfield(model,'metCharges')
0228     model.metCharges=zeros(numel(model.mets),1);
0229 end
0230 model.metCharges(a)=metaCycMets.metCharges(b);
0231 
0232 if ~isfield(model,'b')
0233     model.b=zeros(numel(model.mets),1);
0234 end
0235 %model.b(a)=metaCycMets.b(b);
0236 
0237 if ~isfield(model,'inchis')
0238     model.inchis=cell(numel(model.mets),1);
0239     model.inchis(:)={''};
0240 end
0241 model.inchis(a)=metaCycMets.inchis(b);
0242 
0243 if ~isfield(model,'metMiriams')
0244     model.metMiriams=cell(numel(model.mets),1);
0245 end
0246 model.metMiriams(a)=metaCycMets.metMiriams(b);
0247 
0248 %Put all metabolites in one compartment called 's' (for system). This is
0249 %done just to be more compatible with the rest of the code
0250 model.comps={'s'};
0251 model.compNames={'System'};
0252 model.metComps=ones(numel(model.mets),1);
0253 
0254 %It could also be that the metabolite names are empty for some reason In
0255 %that case, use the ID instead
0256 I=cellfun(@isempty,model.metNames);
0257 model.metNames(I)=model.mets(I);
0258 
0259 %Remove additional fields
0260 model=rmfield(model,{'proteins','bitscore','ppos'});
0261 
0262 %In the end fix grRules and rxnGeneMat
0263 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Get detailed output
0264 model.grRules = grRules;
0265 model.rxnGeneMat = rxnGeneMat;
0266 %Remove the temp fasta file
0267 delete(fastaFile)
0268 end

Generated by m2html © 2005