Description of findECInDB

0001 function [EC,conflicts] = findECInDB(gene_set, DBecNum, DBMW, geneIndex, geneHashMap)
0002 % findECInDB
0003 %   Collects enzyme codes for genes from the UniProt or KEGGdatabase.
0004 %
0005 % Input:
0006 %   gene_set    genes from a given metabolic reaction
0007 %   DBecNum     array of EC numbers
0008 %   DBMW        array of molecular weights in Da
0009 %   geneIndex   as generated by getECfromDatabase
0010 %   geneHashMap as generated by getECfromDatabase
0011 %
0012 % Output:
0013 %   EC          all EC numbers found for the genes in gene_set
0014 %   conflicts   genes with multiple protein matches
0015 %
0016 % Usage:
0017 %   [EC,conflicts] = findECInDB(gene_set, DBecNum, DBMW, geneIndex, geneHashMap)
0018 
0019 %Preallocate function variables
0020 conflicts = cell(1,2);
0021 
0022 %Split the gene set and match each gene:
0023 %gene_set  = strsplit(grRule,' and '); %this string splitting together with getSimpleGeneSets takes 2.6 s, that is ok.
0024 EC_set    = cell(size(gene_set));
0025 
0026 %find index indices of all genes in one call
0027 
0028 %this loop is slow
0029 for j = 1:length(gene_set)
0030      gene = gene_set{j};
0031      matches = [];
0032      %Get the indexes for all of the proteins related to gene
0033      if isKey(geneHashMap,gene) %annoyingly, this seems to be needed
0034          matchInd = cell2mat(values(geneHashMap,gene_set(j)));
0035          matches = geneIndex{matchInd};
0036      end
0037      if ~isempty(matches)
0038         %Get the indexes of the matched protein(s) with non-empty EC#s
0039         nonEmpty = matches(~cellfun(@isempty,DBecNum(matches)));
0040         if ~isempty(nonEmpty)
0041             [geneECs,ia] = unique(DBecNum(nonEmpty),'stable');
0042             %For genes with multiple proteins associated to several
0043             %non-empty ecNumbers
0044             if length(geneECs)>1
0045                 indexes = nonEmpty(ia);
0046                 ecNum = DBecNum(indexes);
0047                 %Save first match
0048                 EC_set{j}  = getECstring(EC_set{j},ecNum{1});
0049                 %Save additional matches as potential conflicts
0050                 if ~ismember(gene,conflicts{1})
0051                     conflicts{1} = [conflicts{1}; {gene}];
0052                     conflicts{2} = [conflicts{2}; {indexes}]; 
0053                 end
0054             else
0055                 %If there is a single unique ec number for the matched
0056                 %protein(s), then choose the lightest protein
0057                 [~,minW]   = min(DBMW(nonEmpty));
0058                 matches    = nonEmpty(minW);
0059                 EC_set{j}  = getECstring(EC_set{j},DBecNum{matches});
0060             end
0061         end
0062     end
0063 
0064     if isempty(EC_set{j})
0065         EC_set{j} = '';
0066     end
0067 end
0068 
0069 %EC: Find union and intersection between all units (only applies for
0070 %complexes, i.e. length(EC_set) > 1):
0071 uni_EC = strsplit(EC_set{1},' ');
0072 int_EC = uni_EC;
0073 for j = 2:length(EC_set)
0074     other_EC = strsplit(EC_set{j},' ');
0075     if isempty(uni_EC)
0076         uni_EC = other_EC;
0077         int_EC = other_EC;
0078     elseif ~isempty(other_EC)
0079         uni_EC = compare_wild([uni_EC other_EC]);
0080         int_EC = intersection(int_EC,other_EC);
0081     end
0082 end
0083 %Use the intersection found, if any. If not, use the union:
0084 if isempty(int_EC)
0085     EC_set = uni_EC;
0086 else
0087     EC_set = int_EC;
0088 end
0089 
0090 %Add new codes as new possible isozymes and remove the initial "EC" text:
0091 
0092 for i = 1:length(EC_set)
0093    if ~isempty(EC_set{i}) 
0094        EC_set{i} = extractAfter(EC_set{i},2);%remove 2 first chars
0095    end
0096 end
0097 
0098 %EC_set = cellfun(@(x) extractAfter(x,2),EC_set,'UniformOutput',false);%remove 2 first chars
0099 EC    = strjoin(EC_set,';');
0100 
0101 
0102 %Delete repeated Uniprots (and the corresponding ECs):
0103 %EC(deleted)   = []; %Not sure if this needs to be done somehow, don't think so
0104 
0105 end
0106 
0107 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0108 function int_EC = intersection(prev_EC,new_EC)
0109 %Finds the common elements between two cell arrays, if any. Also considers
0110 %wildcards (e.g. if 'EC1.1.1.1' is in one array and 'EC1.1.1.-' is in the
0111 %other one, then 'EC1.1.1.1' is added to the intersection).
0112 int_EC = {};
0113 for i = 1:length(prev_EC)
0114     for j = 1:length(new_EC)
0115         new_int = compare_wild({prev_EC{i} new_EC{j}});
0116         if length(new_int) == 1
0117             int_EC = [int_EC new_int];
0118         end
0119     end
0120 end
0121 end
0122 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0123 function int_EC = compare_wild(EC)
0124 %Goes through a cell array of EC numbers, and erases any repetitions,
0125 %considering also wildcards (e.g. will erase 'EC1.2.3.-' if 'EC1.2.3.4' is
0126 %already present).
0127 
0128 %Trim all EC numbers of wild cards (e.g. 'EC1.2.-.-' -> 'EC1.2.'):
0129 EC_trimmed = EC;
0130 for i = 1:length(EC)
0131     %Add a final dot for avoiding issues (e.g. 'EC1.1.1.1' & 'EC1.1.1.12'):
0132     ECi = [EC{i} '.'];
0133     pos = strfind(ECi,'-');
0134     if ~isempty(pos)
0135         ECi = ECi(1:pos(1)-1);
0136     end
0137     EC_trimmed{i} = ECi;
0138 end
0139 
0140 %Compare all EC numbers between them to find if 1 fits in the other:
0141 non_repeated = true(1,length(EC));
0142 for i = 1:length(EC)-1
0143     for j = i+1:length(EC)
0144         ECi = EC_trimmed{i};
0145         ECj = EC_trimmed{j};
0146         %If ECj fits in ECi then ECj can be disregarded:
0147         if strfind(ECi,ECj)
0148             non_repeated(j) = false;
0149         %Else, if ECi fits in ECj then ECi can be disregarded:
0150         elseif strfind(ECj,ECi)
0151             non_repeated(i) = false;
0152         end
0153     end
0154 end
0155 
0156 int_EC = EC(non_repeated);
0157 
0158 end
findECInDB

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE