Description of getECfromDatabase

0001 function model = getECfromDatabase(model, ecRxns, action, modelAdapter)
0002 % getECfromDatabase
0003 %   Populates the model.ec.eccodes field with enzyme codes that are
0004 %   extracted from UniProt and KEGG databases, as assigned to the proteins
0005 %   that catalyze the specific reactions.
0006 %
0007 % Input:
0008 %   model           an ecModel in GECKO 3 format (with ecModel.ec structure)
0009 %   ecRxns          logical of length model.ec.rxns that specifies which
0010 %                   model.ec.eccodes entries should be queried. Exiting
0011 %                   values in model.ec.eccodes will be wiped. Entries that
0012 %                   are indicated by false will be kept and not modified by
0013 %                   this function (optional, by default all model.ec.eccodes
0014 %                   entries are populated by this function)
0015 %   action          response action if multiple proteins with different EC
0016 %                   numbers are found for a given gene in a metabolic
0017 %                   reaction (optional, default 'display')
0018 %                   - 'display' displays all found multiplicities
0019 %                   - 'ignore'  ignore multiplicities and use the protein
0020 %                               with the lowest index in the database.
0021 %                   - 'add'     adds all the multiple proteins as
0022 %                               isozymes for the given reaction
0023 %   modelAdapter    a loaded model adapter (Optional, will otherwise use the
0024 %                   default model adapter).
0025 %
0026 % Output:
0027 %   model           ecModel with populated model.ec.eccodes
0028 %
0029 % Usage:
0030 %   model = getECfromDatabase(model, ecRxns, action, modelAdapter)
0031 
0032 if nargin < 2 || isempty(ecRxns)
0033     ecRnxs = true(numel(model.ec.rxns),1);
0034 end
0035 
0036 if nargin < 3 || isempty(action)
0037     action = 'display';
0038 end
0039 
0040 if nargin < 4 || isempty(modelAdapter)
0041     modelAdapter = ModelAdapterManager.getDefault();
0042     if isempty(modelAdapter)
0043         error('Either send in a modelAdapter or set the default model adapter in the ModelAdapterManager.')
0044     end
0045 end
0046 params = modelAdapter.getParameters();
0047 
0048 rxnEnzMat = model.ec.rxnEnzMat;
0049 genes = modelAdapter.getUniprotCompatibleGenes(model.ec.genes);
0050 
0051 data    = loadDatabases('both', modelAdapter);
0052 uniprot = data.uniprot;
0053 kegg    = data.kegg;
0054 
0055 modelGenes = modelAdapter.getUniprotIDsFromTable(model.genes);
0056 DBgenesUniprot  = data.uniprot.genes;
0057 if ~isequal(modelGenes,model.genes)
0058     [Lia,Locb] = ismember(modelGenes,uniprot.ID);
0059     DBgenesUniprot(Locb(Lia)) = model.genes(Lia);
0060     keepEntry = unique(Locb(Lia));
0061     DBgenesUniprot = DBgenesUniprot(keepEntry);
0062 else
0063     keepEntry = true(numel(DBgenesUniprot),1);
0064 end
0065 DBecNumUniprot  = data.uniprot.eccodes(keepEntry);
0066 DBMWUniprot     = data.uniprot.MW(keepEntry);
0067 %Build an index from gene to prot for faster processing later
0068 [geneIndexUniprot,geneHashMapUniprot] = hashGeneToProt(DBgenesUniprot);
0069 
0070 if ~isempty(kegg)
0071     DBgenesKEGG     = data.kegg.genes;
0072     DBecNumKEGG     = data.kegg.eccodes;
0073     DBMWKEGG        = data.kegg.MW;
0074     [geneIndexKEGG,geneHashMapKEGG]       = hashGeneToProt(DBgenesKEGG);
0075 end
0076 n = size(rxnEnzMat,1);
0077 
0078 eccodes   = cell(n,1);
0079 eccodes(:)= {''};
0080 conflicts = cell(1,4);
0081 
0082 rxnEnzMat = logical(rxnEnzMat);
0083 progressbar('Assigning EC numbers from database')
0084 for i = 1:n
0085     gns = genes(rxnEnzMat(i,:).');
0086     if ~isempty(gns)
0087         %Find match in Uniprot:
0088         [new_EC,multGenes] = findECInDB(gns,DBecNumUniprot,DBMWUniprot,geneIndexUniprot,geneHashMapUniprot);
0089         if ~isempty(new_EC)
0090             DBase    = 'uniprot';
0091             if ~isempty(multGenes{1})
0092                 multGenes{3} = DBase;
0093             end
0094         end
0095         if ~isempty(kegg) && (isempty(new_EC) || endsWith(new_EC,'-'))
0096             %Find match in KEGG
0097             [new_EC_kegg,multGenes] = findECInDB(gns,DBecNumKEGG,DBMWKEGG,geneIndexKEGG,geneHashMapKEGG);
0098             if ~isempty(new_EC_kegg)
0099                 DBase    = 'kegg';
0100                 if ~isempty(multGenes{1})
0101                     multGenes{3} = DBase;
0102                 end
0103                 new_EC=new_EC_kegg;
0104             end
0105         end
0106         eccodes{i} = new_EC;
0107 
0108         if ~isempty(multGenes{1})
0109             %Rxn index
0110             conflicts{1} = [conflicts{1};i];
0111             %Gene IDs
0112             conflicts{2} = [conflicts{2};multGenes{1}];
0113             %Indexes in DB
0114             conflicts{3} = [conflicts{3};multGenes{2}];
0115             %DB name
0116             conflicts{4} = [conflicts{4};{multGenes{3}}];
0117 
0118             %{ I don't understand the purpose of this, let's skip it for now
0119             %if strcmpi(action,'add')
0120             %    if strcmpi(DBase,'swissprot')
0121             %        [uni,EC,MW,Genes] = addMultipleMatches(uni,EC,MW,Genes,multGenes,swissprot);
0122             %    elseif strcmpi(DBase,'KEGG')
0123             %        [uni,EC,MW,Genes] = addMultipleMatches(uni,EC,MW,Genes,multGenes,kegg);
0124             %    end
0125             %end
0126             %}
0127         end
0128     end
0129     progressbar(i/n)
0130 end
0131 
0132 %Display error message with the multiple gene-protein matches found
0133 if strcmpi(action,'display') && ~isempty(conflicts{1})
0134     displayErrorMessage(conflicts,uniprot,kegg)
0135 end
0136 
0137 if nargin < 2 || isempty(ecRxns) || all(ecRxns)
0138     model.ec.eccodes = eccodes;
0139 else
0140     if ~isfield(model.ec,'eccodes')
0141         model.ec.eccodes(1:numel(model.ec.rxns),1) = {''};
0142     end
0143     %Probably faster to subset with ecRxns in the beginning of the script,
0144     %but this was at the moment simpler to implement.
0145     model.ec.eccodes(ecRxns) = eccodes(ecRxns);
0146 end
0147 
0148 function displayErrorMessage(conflicts,uniprot,kegg)
0149 STR = ['\n ' num2str(length(conflicts{1})) ' genes with multiple associated proteins were found, please'];
0150 STR = [STR, ' revise case by case in the uniprot and kegg files:\n\n'];
0151 for i=1:length(conflicts{1})
0152     if strcmpi(conflicts{4}{i},'uniprot')
0153         DB = uniprot.ID;
0154     else
0155         DB = kegg.uniprot;
0156     end
0157     proteins = DB(conflicts{3}{i});
0158     STR = [STR, '- gene: ' conflicts{2}{i} '  Proteins: ' strjoin(proteins) '\n'];
0159 end
0160 STR = [STR, '\nIf a wrongly annotated case was found then call the '];
0161 STR = [STR, 'getECfromDatabase.m function again with the option action'];
0162 STR = [STR, '= ignore\n\n'];
0163 STR = [STR, 'If the conflicting proteins are desired to be kept as isozymes'];
0164 STR = [STR, ' then call the getECfromDatabase.m function'];
0165 STR = [STR, ' again with the option action = add\n'];
0166 error(sprintf(STR))
0167 end
0168 
0169 function [geneIndex,geneHashMap]=hashGeneToProt(proteinDB)
0170 
0171 [x,y] = size(proteinDB);
0172 genesForIndex = reshape(proteinDB, x*y, 1);
0173 genesForIndex = genesForIndex(~cellfun(@isempty, genesForIndex));
0174 genesForIndex = unique(genesForIndex);
0175 geneIndex = cell(length(genesForIndex),1);
0176 geneHashMap = containers.Map(genesForIndex,1:length(genesForIndex));
0177 protIndices = 1:length(proteinDB(:,1));
0178 for i = 1:y
0179     tmp1 = proteinDB(:,i);
0180     sel = ~cellfun(@isempty, tmp1);
0181     indices = cell2mat(values(geneHashMap,tmp1(sel)));
0182     protIndicesSel = protIndices(sel);
0183     for j = 1:length(indices)
0184         geneIndex{indices(j)} = [geneIndex{indices(j)};protIndicesSel(j)];
0185     end
0186 end
0187 end
0188 end
getECfromDatabase

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE