Home > external > kegg > getModelFromKEGG.m

getModelFromKEGG

PURPOSE ^

getModelFromKEGG

SYNOPSIS ^

function [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,keepUndefinedStoich,keepIncomplete,keepGeneral)

DESCRIPTION ^

 getModelFromKEGG
   Retrieves information stored in KEGG database and generates a model

   Input:
   keggPath            if keggGenes.mat, keggMets.mat, keggPhylDist.mat or
                       keggRxns.mat are not in the RAVEN/external/kegg
                       directory, this function will attempt to read data
                       from a local FTP dump of the KEGG database.
                       keggPath is the path to the root of this database
                       (optional, default 'RAVEN/external/kegg'). If
                       keggModel.mat is present in the same directory, the
                       function reads the data from this file and ignores
                       keggGenes.mat, keggMets.mat and keggRxns.mat
   keepSpontaneous     include reactions labeled as "spontaneous" (optional,
                       default true)
   keepUndefinedStoich include reactions in the form n A <=> n+1 A. These
                       will be dealt with as two separate metabolites
                       (optional, default true)
   keepIncomplete      include reactions which have been labelled as
                       "incomplete", "erroneous" or "unclear" (optional,
                       default true)
   keepGeneral         include reactions which have been labelled as
                       "general reaction". These are reactions on the form
                       "an aldehyde <=> an alcohol", and are therefore
                       unsuited for modelling purposes. Note that not all
                       reactions have this type of annotation, and the
                       script will therefore not be able to remove all
                       such reactions (optional, default false)

   Output:
   model               a model structure generated from the database. All
                       reactions and the metabolites used in them will be
                       added
   KOModel             a model structure representing the KEGG Orthology
                       ids and their associated genes. The KO ids are
                       saved as reactions

   NOTE: The model output from getModelFromKEGG can be used as template
   for fillGaps. In that case, ensure that the genes and rxnGeneMat fields
   are removed before parsing: model=rmfield(model,'genes'), etc.

 Usage: [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,...
    keepUndefinedStoich,keepIncomplete,keepGeneral)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,...
0002     keepUndefinedStoich,keepIncomplete,keepGeneral)
0003 % getModelFromKEGG
0004 %   Retrieves information stored in KEGG database and generates a model
0005 %
0006 %   Input:
0007 %   keggPath            if keggGenes.mat, keggMets.mat, keggPhylDist.mat or
0008 %                       keggRxns.mat are not in the RAVEN/external/kegg
0009 %                       directory, this function will attempt to read data
0010 %                       from a local FTP dump of the KEGG database.
0011 %                       keggPath is the path to the root of this database
0012 %                       (optional, default 'RAVEN/external/kegg'). If
0013 %                       keggModel.mat is present in the same directory, the
0014 %                       function reads the data from this file and ignores
0015 %                       keggGenes.mat, keggMets.mat and keggRxns.mat
0016 %   keepSpontaneous     include reactions labeled as "spontaneous" (optional,
0017 %                       default true)
0018 %   keepUndefinedStoich include reactions in the form n A <=> n+1 A. These
0019 %                       will be dealt with as two separate metabolites
0020 %                       (optional, default true)
0021 %   keepIncomplete      include reactions which have been labelled as
0022 %                       "incomplete", "erroneous" or "unclear" (optional,
0023 %                       default true)
0024 %   keepGeneral         include reactions which have been labelled as
0025 %                       "general reaction". These are reactions on the form
0026 %                       "an aldehyde <=> an alcohol", and are therefore
0027 %                       unsuited for modelling purposes. Note that not all
0028 %                       reactions have this type of annotation, and the
0029 %                       script will therefore not be able to remove all
0030 %                       such reactions (optional, default false)
0031 %
0032 %   Output:
0033 %   model               a model structure generated from the database. All
0034 %                       reactions and the metabolites used in them will be
0035 %                       added
0036 %   KOModel             a model structure representing the KEGG Orthology
0037 %                       ids and their associated genes. The KO ids are
0038 %                       saved as reactions
0039 %
0040 %   NOTE: The model output from getModelFromKEGG can be used as template
0041 %   for fillGaps. In that case, ensure that the genes and rxnGeneMat fields
0042 %   are removed before parsing: model=rmfield(model,'genes'), etc.
0043 %
0044 % Usage: [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,...
0045 %    keepUndefinedStoich,keepIncomplete,keepGeneral)
0046 
0047 ravenPath=findRAVENroot();
0048 
0049 if nargin<1
0050     keggPath=fullfile(ravenPath,'external','kegg');
0051 else
0052     keggPath=char(keggPath);
0053 end
0054 if nargin<2
0055     keepSpontaneous=true;
0056 end
0057 if nargin<3
0058     keepUndefinedStoich=true;
0059 end
0060 if nargin<4
0061     keepIncomplete=true;
0062 end
0063 if nargin<5
0064     keepGeneral=false;
0065 end
0066 
0067 modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat');
0068 if exist(modelFile, 'file') && isNewestFile(ravenPath)
0069     fprintf(['Importing the global KEGG model from ' strrep(modelFile,'\','/') '... ']);
0070     load(modelFile);
0071     fprintf('COMPLETE\n');
0072 else
0073     fprintf(['NOTE: The file ' strrep(modelFile,'\','/') ' does not exist or is out-of-date and therefore will be (re)generated\n']);
0074     %First get all reactions
0075     [model,isSpontaneous,isUndefinedStoich,isIncomplete,isGeneral]=getRxnsFromKEGG(keggPath);
0076     
0077     %Get the KO ids that are associated with any of the reactions. They
0078     %will be used later on to create a rxn-gene matrix
0079     KOs=cell(numel(model.rxns)*2,1);
0080     %Make room for two KO ids per reaction
0081     
0082     addToIndex=1;
0083     %Add all KO ids that are used
0084     for i=1:numel(model.rxns)
0085         if isstruct(model.rxnMiriams{i})
0086             for j=1:numel(model.rxnMiriams{i}.name)
0087                 if strcmpi('kegg.orthology',model.rxnMiriams{i}.name{j})
0088                     %Add the KO id
0089                     KOs(addToIndex)=model.rxnMiriams{i}.value(j);
0090                     addToIndex=addToIndex+1;
0091                 end
0092             end
0093         end
0094     end
0095     
0096     KOs=KOs(1:addToIndex-1);
0097     KOs=unique(KOs);
0098     
0099     %Get all genes from any organism in KEGG that is associated with any of
0100     %the KOs
0101     KOModel=getGenesFromKEGG(keggPath,KOs);
0102     
0103     fprintf('Pruning the global KEGG model from the partially annotated, lumped KEGG Orthology entries... ')
0104     model.genes=KOModel.genes;
0105     
0106     %It can be that there are KOs from the reactions that have no database
0107     %entry. These are (as far as I have seen) lumped versions of other KOs
0108     %and should be removed
0109     KOsToRemove=setdiff(KOs, KOModel.rxns);
0110     
0111     %Loop through all reactions and delete the KOs that were not found
0112     for i=1:numel(model.rxns)
0113         if isstruct(model.rxnMiriams{i})
0114             for j=1:numel(model.rxnMiriams{i}.name)
0115                 toDel=[];
0116                 if strcmp(model.rxnMiriams{i}.name{j},'kegg.orthology')
0117                     if ismember(model.rxnMiriams{i}.value{j},KOsToRemove)
0118                         toDel=[toDel;j];
0119                     end
0120                 end
0121             end
0122             %Delete the KOs
0123             if any(toDel)
0124                 %If all posts are deleted, then delete the whole structure
0125                 if numel(toDel)==j
0126                     model.rxnMiriams{i}=[];
0127                 else
0128                     model.rxnMiriams{i}.name(toDel)=[];
0129                     model.rxnMiriams{i}.value(toDel)=[];
0130                 end
0131             end
0132         end
0133     end
0134     fprintf('COMPLETE\n');
0135     
0136     fprintf('Constructing the rxnGeneMat for the global KEGG model...   0%% complete');
0137     %Create the rxnGeneMat for the reactions. This is simply done by
0138     %merging the gene associations for all the involved KOs
0139     r=zeros(10000000,1);
0140     %Store the positions since it's slow to write to a sparse array in a
0141     %loop
0142     c=zeros(10000000,1);
0143     counter=1;
0144     for i=1:numel(model.rxns)
0145         if isstruct(model.rxnMiriams{i})
0146             I=strncmp('kegg.orthology',model.rxnMiriams{i}.name,18);
0147             if any(I)
0148                 [J, K]=ismember(model.rxnMiriams{i}.value(I),KOModel.rxns);
0149                 %Find all gene indexes that correspond to any of these KOs
0150                 [~, L]=find(KOModel.rxnGeneMat(K(J),:));
0151                 if any(L)
0152                     %Allocate room for more elements if needed
0153                     if counter+numel(L)-1>=numel(r)
0154                         r=[r;zeros(numel(r),1)];
0155                         c=[c;zeros(numel(c),1)];
0156                     end
0157                     r(counter:counter+numel(L)-1)=ones(numel(L),1)*i;
0158                     c(counter:counter+numel(L)-1)=L(:);
0159                     counter=counter+numel(L);
0160                 end
0161             end
0162         end
0163         if rem(i-1,100) == 0
0164             progress=pad(num2str(floor(i/numel(model.rxns)*100)),3,'left');
0165             fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress);
0166         end
0167     end
0168     
0169     model.rxnGeneMat=sparse(r(1:counter-1),c(1:counter-1),ones(counter-1,1));
0170     if size(model.rxnGeneMat,1)~=numel(model.rxns) || size(model.rxnGeneMat,2)~=numel(KOModel.genes)
0171         model.rxnGeneMat(numel(model.rxns),numel(KOModel.genes))=0;
0172     end
0173     fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n');
0174     
0175     %Then get all metabolites
0176     metModel=getMetsFromKEGG(keggPath);
0177     
0178     fprintf('Finalizing the global KEGG model... ');
0179     %Add information about all metabolites to the model
0180     [a, b]=ismember(model.mets,metModel.mets);
0181     a=find(a);
0182     b=b(a);
0183     
0184     if ~isfield(model,'metNames')
0185         model.metNames=cell(numel(model.mets),1);
0186         model.metNames(:)={''};
0187     end
0188     model.metNames(a)=metModel.metNames(b);
0189     
0190     if ~isfield(model,'metFormulas')
0191         model.metFormulas=cell(numel(model.mets),1);
0192         model.metFormulas(:)={''};
0193     end
0194     model.metFormulas(a)=metModel.metFormulas(b);
0195     
0196     if ~isfield(model,'inchis')
0197         model.inchis=cell(numel(model.mets),1);
0198         model.inchis(:)={''};
0199     end
0200     model.inchis(a)=metModel.inchis(b);
0201     
0202     if ~isfield(model,'metMiriams')
0203         model.metMiriams=cell(numel(model.mets),1);
0204     end
0205     model.metMiriams(a)=metModel.metMiriams(b);
0206     
0207     %The composition should be loaded from InChIs when available
0208     I=find(~cellfun(@isempty,model.inchis));
0209     for i=1:numel(I)
0210         S=regexp(model.inchis(I(i)),'/','split');
0211         S=S{1};
0212         if numel(S)>=2
0213             %Don't copy if it doesn't look good
0214             model.metFormulas(I(i))=S(2);
0215         end
0216     end
0217     
0218     %Put all metabolites in one compartment called 's' (for system). This
0219     %is done just to be more compatible with the rest of the code
0220     model.comps={'s'};
0221     model.compNames={'System'};
0222     model.metComps=ones(numel(model.mets),1);
0223     
0224     %If reactions with undefined stoichiometry are kept, then the
0225     %corresponding metabolites will have ids such as "(n+1) C00001" and
0226     %their names will be empty. These ids are not valid SBML identifiers
0227     %and are therefore replaced with "undefined1, undefined2...". The
0228     %former ids are stored as the new names
0229     I=find(cellfun(@any,strfind(model.mets,'n')) | cellfun(@any,strfind(model.mets,'m')));
0230     model.metNames(I)=model.mets(I);
0231     repNums=1:numel(I);
0232     repIDs=strcat('undefined_',cellfun(@num2str,num2cell(repNums(:)),'UniformOutput',false));
0233     model.mets(I)=repIDs;
0234     
0235     %It could also be that the metabolite names are empty for some other
0236     %reason. In that case, use the ID instead
0237     I=cellfun(@isempty,model.metNames);
0238     model.metNames(I)=model.mets(I);
0239 
0240     %Deafult LB and UB
0241     model.annotation.defaultLB = -1000;
0242     model.annotation.defaultUB = 1000;
0243     
0244     %Save the model structure
0245     save(modelFile,'model','KOModel','isGeneral','isIncomplete','isUndefinedStoich','isSpontaneous');
0246     fprintf('COMPLETE\n');
0247 end
0248 
0249 %Delete reactions which are labeled as "incomplete", "erroneous",
0250 %"unclear", "general reaction" or having undefined stoichiometry (depending
0251 %on settings)
0252 if keepSpontaneous==false
0253     model=removeReactions(model,intersect(isSpontaneous,model.rxns),true,true);
0254 end
0255 if keepUndefinedStoich==false
0256     model=removeReactions(model,intersect(isUndefinedStoich,model.rxns),true,true);
0257 end
0258 if keepIncomplete==false
0259     model=removeReactions(model,intersect(isIncomplete,model.rxns),true,true);
0260 end
0261 if keepGeneral==false
0262     model=removeReactions(model,intersect(isGeneral,model.rxns),true,true);
0263 end
0264 
0265 end
0266 
0267 function output = isNewestFile(ravenPath)
0268 %The ad hoc function, which checks whether keggModel.mat is the more
0269 %recently modified than keggRxns.mat, keggGenes.mat and keggRxns.mat
0270 modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat');
0271 rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat');
0272 genesFile=fullfile(ravenPath,'external','kegg','keggGenes.mat');
0273 metsFile=fullfile(ravenPath,'external','kegg','keggMets.mat');
0274 if (getFileTime(modelFile)>getFileTime(rxnsFile))&&...
0275         (getFileTime(modelFile)>getFileTime(genesFile))&&...
0276         (getFileTime(modelFile)>getFileTime(metsFile))
0277     output=1;
0278 else
0279     output=0;
0280 end
0281 end
0282 
0283 function modTime = getFileTime(fileName)
0284 %Gets a last modification time for a particular file in datenum format that
0285 %the numbers could be easily compared for different files
0286 listing = dir(fileName);
0287 assert(numel(listing) == 1, 'No such file: %s', fileName);
0288 modTime = listing.datenum;
0289 format long;
0290 end

Generated by m2html © 2005