Home > src > geckomat > get_enzyme_data > getECfromDatabase.m

getECfromDatabase

PURPOSE ^

getECfromDatabase

SYNOPSIS ^

function model = getECfromDatabase(model, ecRxns, action, modelAdapter)

DESCRIPTION ^

 getECfromDatabase
   Populates the model.ec.eccodes field with enzyme codes that are
   extracted from UniProt and KEGG databases, as assigned to the proteins
   that catalyze the specific reactions.

 Input:
   model           an ecModel in GECKO 3 format (with ecModel.ec structure)
   ecRxns          logical of length model.ec.rxns that specifies which
                   model.ec.eccodes entries should be queried. Exiting
                   values in model.ec.eccodes will be wiped. Entries that
                   are indicated by false will be kept and not modified by
                   this function (optional, by default all model.ec.eccodes
                   entries are populated by this function)
   action          response action if multiple proteins with different EC
                   numbers are found for a given gene in a metabolic
                   reaction (optional, default 'display')
                   - 'display' displays all found multiplicities
                   - 'ignore'  ignore multiplicities and use the protein
                               with the lowest index in the database.
                   - 'add'     adds all the multiple proteins as
                               isozymes for the given reaction
   modelAdapter    a loaded model adapter (Optional, will otherwise use the
                   default model adapter).

 Output:
   model           ecModel with populated model.ec.eccodes

 Usage:
   model = getECfromDatabase(model, ecRxns, action, modelAdapter)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function model = getECfromDatabase(model, ecRxns, action, modelAdapter)
0002 % getECfromDatabase
0003 %   Populates the model.ec.eccodes field with enzyme codes that are
0004 %   extracted from UniProt and KEGG databases, as assigned to the proteins
0005 %   that catalyze the specific reactions.
0006 %
0007 % Input:
0008 %   model           an ecModel in GECKO 3 format (with ecModel.ec structure)
0009 %   ecRxns          logical of length model.ec.rxns that specifies which
0010 %                   model.ec.eccodes entries should be queried. Exiting
0011 %                   values in model.ec.eccodes will be wiped. Entries that
0012 %                   are indicated by false will be kept and not modified by
0013 %                   this function (optional, by default all model.ec.eccodes
0014 %                   entries are populated by this function)
0015 %   action          response action if multiple proteins with different EC
0016 %                   numbers are found for a given gene in a metabolic
0017 %                   reaction (optional, default 'display')
0018 %                   - 'display' displays all found multiplicities
0019 %                   - 'ignore'  ignore multiplicities and use the protein
0020 %                               with the lowest index in the database.
0021 %                   - 'add'     adds all the multiple proteins as
0022 %                               isozymes for the given reaction
0023 %   modelAdapter    a loaded model adapter (Optional, will otherwise use the
0024 %                   default model adapter).
0025 %
0026 % Output:
0027 %   model           ecModel with populated model.ec.eccodes
0028 %
0029 % Usage:
0030 %   model = getECfromDatabase(model, ecRxns, action, modelAdapter)
0031 
0032 if nargin < 2 || isempty(ecRxns)
0033     ecRnxs = true(numel(model.ec.rxns),1);
0034 end
0035 
0036 if nargin < 3 || isempty(action)
0037     action = 'display';
0038 end
0039 
0040 if nargin < 4 || isempty(modelAdapter)
0041     modelAdapter = ModelAdapterManager.getDefault();
0042     if isempty(modelAdapter)
0043         error('Either send in a modelAdapter or set the default model adapter in the ModelAdapterManager.')
0044     end
0045 end
0046 params = modelAdapter.getParameters();
0047 
0048 rxnEnzMat = model.ec.rxnEnzMat;
0049 genes = modelAdapter.getUniprotCompatibleGenes(model.ec.genes);
0050 
0051 data    = loadDatabases('both', modelAdapter);
0052 uniprot = data.uniprot;
0053 kegg    = data.kegg;
0054 
0055 modelGenes = modelAdapter.getUniprotIDsFromTable(model.genes);
0056 DBgenesUniprot  = data.uniprot.genes;
0057 if ~isequal(modelGenes,model.genes)
0058     [Lia,Locb] = ismember(modelGenes,uniprot.ID);
0059     DBgenesUniprot(Locb(Lia)) = model.genes(Lia);
0060     keepEntry = unique(Locb(Lia));
0061     DBgenesUniprot = DBgenesUniprot(keepEntry);
0062 else
0063     keepEntry = true(numel(DBgenesUniprot),1);
0064 end
0065 DBecNumUniprot  = data.uniprot.eccodes(keepEntry);
0066 DBMWUniprot     = data.uniprot.MW(keepEntry);
0067 %Build an index from gene to prot for faster processing later
0068 [geneIndexUniprot,geneHashMapUniprot] = hashGeneToProt(DBgenesUniprot);
0069 
0070 if ~isempty(kegg)
0071     DBgenesKEGG     = data.kegg.genes;
0072     DBecNumKEGG     = data.kegg.eccodes;
0073     DBMWKEGG        = data.kegg.MW;
0074     [geneIndexKEGG,geneHashMapKEGG]       = hashGeneToProt(DBgenesKEGG);
0075 end
0076 n = size(rxnEnzMat,1);
0077 
0078 eccodes   = cell(n,1);
0079 eccodes(:)= {''};
0080 conflicts = cell(1,4);
0081 
0082 rxnEnzMat = logical(rxnEnzMat);
0083 progressbar('Assigning EC numbers from database')
0084 for i = 1:n
0085     gns = genes(rxnEnzMat(i,:).');
0086     if ~isempty(gns)
0087         %Find match in Uniprot:
0088         [new_EC,multGenes] = findECInDB(gns,DBecNumUniprot,DBMWUniprot,geneIndexUniprot,geneHashMapUniprot);
0089         if ~isempty(new_EC)
0090             DBase    = 'uniprot';
0091             if ~isempty(multGenes{1})
0092                 multGenes{3} = DBase;
0093             end
0094         end
0095         if ~isempty(kegg) && (isempty(new_EC) || endsWith(new_EC,'-'))
0096             %Find match in KEGG
0097             [new_EC_kegg,multGenes] = findECInDB(gns,DBecNumKEGG,DBMWKEGG,geneIndexKEGG,geneHashMapKEGG);
0098             if ~isempty(new_EC_kegg)
0099                 DBase    = 'kegg';
0100                 if ~isempty(multGenes{1})
0101                     multGenes{3} = DBase;
0102                 end
0103                 new_EC=new_EC_kegg;
0104             end
0105         end
0106         eccodes{i} = new_EC;
0107 
0108         if ~isempty(multGenes{1})
0109             %Rxn index
0110             conflicts{1} = [conflicts{1};i];
0111             %Gene IDs
0112             conflicts{2} = [conflicts{2};multGenes{1}];
0113             %Indexes in DB
0114             conflicts{3} = [conflicts{3};multGenes{2}];
0115             %DB name
0116             conflicts{4} = [conflicts{4};{multGenes{3}}];
0117 
0118             %{ I don't understand the purpose of this, let's skip it for now
0119             %if strcmpi(action,'add')
0120             %    if strcmpi(DBase,'swissprot')
0121             %        [uni,EC,MW,Genes] = addMultipleMatches(uni,EC,MW,Genes,multGenes,swissprot);
0122             %    elseif strcmpi(DBase,'KEGG')
0123             %        [uni,EC,MW,Genes] = addMultipleMatches(uni,EC,MW,Genes,multGenes,kegg);
0124             %    end
0125             %end
0126             %}
0127         end
0128     end
0129     progressbar(i/n)
0130 end
0131 
0132 %Display error message with the multiple gene-protein matches found
0133 if strcmpi(action,'display') && ~isempty(conflicts{1})
0134     displayErrorMessage(conflicts,uniprot,kegg)
0135 end
0136 
0137 if nargin < 2 || isempty(ecRxns) || all(ecRxns)
0138     model.ec.eccodes = eccodes;
0139 else
0140     if ~isfield(model.ec,'eccodes')
0141         model.ec.eccodes(1:numel(model.ec.rxns),1) = {''};
0142     end
0143     %Probably faster to subset with ecRxns in the beginning of the script,
0144     %but this was at the moment simpler to implement.
0145     model.ec.eccodes(ecRxns) = eccodes(ecRxns);
0146 end
0147 
0148 function displayErrorMessage(conflicts,uniprot,kegg)
0149 STR = ['\n ' num2str(length(conflicts{1})) ' genes with multiple associated proteins were found, please'];
0150 STR = [STR, ' revise case by case in the uniprot and kegg files:\n\n'];
0151 for i=1:length(conflicts{1})
0152     if strcmpi(conflicts{4}{i},'uniprot')
0153         DB = uniprot.ID;
0154     else
0155         DB = kegg.uniprot;
0156     end
0157     proteins = DB(conflicts{3}{i});
0158     STR = [STR, '- gene: ' conflicts{2}{i} '  Proteins: ' strjoin(proteins) '\n'];
0159 end
0160 STR = [STR, '\nIf a wrongly annotated case was found then call the '];
0161 STR = [STR, 'getECfromDatabase.m function again with the option action'];
0162 STR = [STR, '= ignore\n\n'];
0163 STR = [STR, 'If the conflicting proteins are desired to be kept as isozymes'];
0164 STR = [STR, ' then call the getECfromDatabase.m function'];
0165 STR = [STR, ' again with the option action = add\n'];
0166 error(sprintf(STR))
0167 end
0168 
0169 function [geneIndex,geneHashMap]=hashGeneToProt(proteinDB)
0170 
0171 [x,y] = size(proteinDB);
0172 genesForIndex = reshape(proteinDB, x*y, 1);
0173 genesForIndex = genesForIndex(~cellfun(@isempty, genesForIndex));
0174 genesForIndex = unique(genesForIndex);
0175 geneIndex = cell(length(genesForIndex),1);
0176 geneHashMap = containers.Map(genesForIndex,1:length(genesForIndex));
0177 protIndices = 1:length(proteinDB(:,1));
0178 for i = 1:y
0179     tmp1 = proteinDB(:,i);
0180     sel = ~cellfun(@isempty, tmp1);
0181     indices = cell2mat(values(geneHashMap,tmp1(sel)));
0182     protIndicesSel = protIndices(sel);
0183     for j = 1:length(indices)
0184         geneIndex{indices(j)} = [geneIndex{indices(j)};protIndicesSel(j)];
0185     end
0186 end
0187 end
0188 end

Generated by m2html © 2005