0001 function kcatList = fuzzyKcatMatching(model, ecRxns, modelAdapter, forceWClvl)
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053 if nargin<2 || isempty(ecRxns)
0054 ecRxns = true(numel(model.ec.rxns),1);
0055 elseif isnumeric(ecRxns)
0056 ecRxnsVec = false(numel(model.ec.rxns),1);
0057 ecRxnsVec(ecRxns) = true;
0058 ecRxns = ecRxnsVec;
0059 end
0060 ecRxns=find(ecRxns);
0061
0062 if nargin < 3 || isempty(modelAdapter)
0063 modelAdapter = ModelAdapterManager.getDefault();
0064 if isempty(modelAdapter)
0065 error('Either send in a modelAdapter or set the default model adapter in the ModelAdapterManager.')
0066 end
0067 end
0068 params = modelAdapter.params;
0069
0070 if nargin < 4 || isempty(forceWClvl)
0071 forceWClvl = 0;
0072 end
0073
0074 if ~isfield(model.ec,'eccodes')
0075 error('No EC codes defined in model.ec.eccodes. First run getECfromGEM() and/or getECfromDatabase().')
0076 end
0077 eccodes = model.ec.eccodes(ecRxns);
0078 substrates = cell(numel(ecRxns),1);
0079 substrCoeffs = cell(numel(ecRxns),1);
0080
0081
0082 if ~model.ec.geckoLight
0083 rxnNames = model.ec.rxns;
0084 else
0085 rxnNames = extractAfter(model.ec.rxns, 4);
0086 end
0087 [~,originalRxns] = ismember(rxnNames(ecRxns),model.rxns);
0088 for i = 1:length(ecRxns)
0089 sel = find(model.S(:,originalRxns(i)) < 0);
0090 substrates{i} = model.metNames(sel);
0091 substrCoeffs{i} = -model.S(sel,originalRxns(i));
0092 end
0093
0094
0095 [KCATcell, SAcell] = loadBRENDAdata(modelAdapter);
0096
0097
0098
0099 phylDistStruct = KEGG_struct(modelAdapter.getPhylDistStructPath());
0100
0101 org_name = params.org_name;
0102 org_index = find_inKEGG(org_name,phylDistStruct.names);
0103
0104
0105 phylDistStruct.genus = lower(regexprep(phylDistStruct.names,'\s.*',''));
0106
0107 phylDistStruct.uniqueGenusList = unique(phylDistStruct.genus);
0108 phylDistStruct.genusHashMap = containers.Map(phylDistStruct.uniqueGenusList,1:length(phylDistStruct.uniqueGenusList));
0109 phylDistStruct.uniqueGenusIndices = cell(length(phylDistStruct.uniqueGenusList),1);
0110
0111
0112 for i = 1:length(phylDistStruct.genus)
0113 matchInd = cell2mat(values(phylDistStruct.genusHashMap,phylDistStruct.genus(i)));
0114 phylDistStruct.uniqueGenusIndices{matchInd} = [phylDistStruct.uniqueGenusIndices{matchInd};i];
0115 end
0116
0117
0118 kcats = zeros(length(eccodes),1);
0119 mM = length(eccodes);
0120
0121
0122
0123
0124 kcatInfo.info.org_s = zeros(mM,1);
0125 kcatInfo.info.rest_s = zeros(mM,1);
0126 kcatInfo.info.org_ns = zeros(mM,1);
0127 kcatInfo.info.rest_ns = zeros(mM,1);
0128 kcatInfo.info.org_sa = zeros(mM,1);
0129 kcatInfo.info.rest_sa = zeros(mM,1);
0130 kcatInfo.info.wcLevel = NaN(mM,1);
0131 kcatInfo.stats.queries = 0;
0132 kcatInfo.stats.org_s = 0;
0133 kcatInfo.stats.rest_s = 0;
0134 kcatInfo.stats.org_ns = 0;
0135 kcatInfo.stats.rest_ns = 0;
0136 kcatInfo.stats.org_sa = 0;
0137 kcatInfo.stats.rest_sa = 0;
0138 kcatInfo.stats.wc0 = 0;
0139 kcatInfo.stats.wc1 = 0;
0140 kcatInfo.stats.wc2 = 0;
0141 kcatInfo.stats.wc3 = 0;
0142 kcatInfo.stats.wc4 = 0;
0143 kcatInfo.stats.matrix = zeros(6,5);
0144
0145
0146
0147
0148
0149 [ECIndexIds,~,ic] = unique(KCATcell{1});
0150 EcIndexIndices = cell(length(ECIndexIds),1);
0151 for i = 1:length(EcIndexIndices)
0152 EcIndexIndices{i} = find(ic == i).';
0153 end
0154
0155
0156 while forceWClvl > 0
0157 eccodes=regexprep(eccodes,'(.)*(\.\d+)(\.-)*$','$1\.-$3');
0158 forceWClvl = forceWClvl - 1;
0159 end
0160 if forceWClvl == 1
0161 eccodes = regexprep(eccodes,'.*','-\.-\.-\.-');
0162 end
0163
0164 progressbar('Gathering kcat values by fuzzy matching to BRENDA database')
0165
0166 for i = 1:mM
0167
0168 EC = eccodes{i};
0169 if ~isempty(EC)
0170 EC = strsplit(EC,';');
0171
0172 if ~isempty(substrates{i})
0173 [kcats(i), kcatInfo.info,kcatInfo.stats] = iterativeMatch(EC,substrates{i},substrCoeffs{i},i,KCATcell,...
0174 kcatInfo.info,kcatInfo.stats,org_name,...
0175 phylDistStruct,org_index,SAcell,ECIndexIds,EcIndexIndices);
0176 end
0177 end
0178 progressbar(i/mM)
0179 end
0180
0181 kcatList.source = 'brenda';
0182 kcatList.rxns = model.ec.rxns(ecRxns);
0183 kcatList.substrates = substrates;
0184 kcatList.kcats = kcats;
0185 kcatList.eccodes = eccodes;
0186 kcatList.wildcardLvl = kcatInfo.info.wcLevel;
0187 kcatList.origin = NaN(numel(model.ec.rxns(ecRxns)),1);
0188
0189
0190 origin = [kcatInfo.info.org_s kcatInfo.info.rest_s kcatInfo.info.org_ns kcatInfo.info.rest_ns kcatInfo.info.org_sa kcatInfo.info.rest_sa];
0191 for i=1:6
0192 kcatList.origin(find(origin(:,i))) = i;
0193 end
0194 end
0195
0196 function [kcat,dir,tot] =iterativeMatch(EC,subs,substrCoeff,i,KCATcell,dir,tot,...
0197 name,phylDist,org_index,SAcell,ECIndexIds,EcIndexIndices)
0198
0199
0200
0201 kcat = zeros(size(EC));
0202 origin = zeros(size(EC));
0203 matches = zeros(size(EC));
0204 wc_num = ones(size(EC)).*1000;
0205 for k = 1:length(EC)
0206 success = false;
0207 while ~success
0208
0209 [kcat(k),origin(k),matches(k)] = mainMatch(EC{k},subs,substrCoeff,KCATcell,...
0210 name,phylDist,...
0211 org_index,SAcell,ECIndexIds,EcIndexIndices);
0212
0213
0214 if origin(k) > 0
0215 success = true;
0216 wc_num(k) = sum(EC{k}=='-');
0217 else
0218 dot_pos = [2 strfind(EC{k},'.')];
0219 wild_num = sum(EC{k}=='-');
0220 wc_text = '-.-.-.-';
0221 EC{k} = [EC{k}(1:dot_pos(4-wild_num)) wc_text(1:2*wild_num+1)];
0222 end
0223 end
0224 end
0225
0226 if sum(origin) > 0
0227
0228
0229 best_pos = (wc_num == min(wc_num));
0230 new_origin = origin(best_pos);
0231 best_pos = (origin == min(new_origin(new_origin~=0)));
0232 max_pos = find(kcat == max(kcat(best_pos)));
0233 wc_num = wc_num(max_pos(1));
0234 origin = origin(max_pos(1));
0235 matches = matches(max_pos(1));
0236 kcat = kcat(max_pos(1));
0237
0238
0239 dir.org_s(i) = matches*(origin == 1);
0240 dir.rest_s(i) = matches*(origin == 2);
0241 dir.org_ns(i) = matches*(origin == 3);
0242 dir.org_sa(i) = matches*(origin == 4);
0243 dir.rest_ns(i) = matches*(origin == 5);
0244 dir.rest_sa(i) = matches*(origin == 6);
0245 dir.wcLevel(i) = wc_num;
0246 tot.org_s = tot.org_s + (origin == 1);
0247 tot.rest_s = tot.rest_s + (origin == 2);
0248 tot.org_ns = tot.org_ns + (origin == 3);
0249 tot.org_sa = tot.org_sa + (origin == 4);
0250 tot.rest_ns = tot.rest_ns + (origin == 5);
0251 tot.rest_sa = tot.rest_sa + (origin == 6);
0252 tot.wc0 = tot.wc0 + (wc_num == 0);
0253 tot.wc1 = tot.wc1 + (wc_num == 1);
0254 tot.wc2 = tot.wc2 + (wc_num == 2);
0255 tot.wc3 = tot.wc3 + (wc_num == 3);
0256 tot.wc4 = tot.wc4 + (wc_num == 4);
0257 tot.queries = tot.queries + 1;
0258 tot.matrix(origin,wc_num+1) = tot.matrix(origin,wc_num+1) + 1;
0259 end
0260
0261 end
0262
0263
0264 function [kcat,origin,matches] = mainMatch(EC,subs,substrCoeff,KCATcell,...
0265 name,phylDist,org_index,SAcell,ECIndexIds,EcIndexIndices)
0266
0267
0268
0269
0270 wild = false;
0271 wild_pos = strfind(EC,'-');
0272 if ~isempty(wild_pos)
0273 EC = EC(1:wild_pos(1)-1);
0274 wild = true;
0275 end
0276 stringMatchesEC_cell = extract_string_matches(EC,KCATcell{1},wild,ECIndexIds,EcIndexIndices);
0277
0278
0279
0280
0281 origin = 0;
0282
0283 [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,name,true,false,...
0284 phylDist,org_index,SAcell,stringMatchesEC_cell,[]);
0285 if matches > 0 && ~wild
0286 origin = 1;
0287
0288 else
0289 [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,'',true,false,...
0290 phylDist,org_index,SAcell,stringMatchesEC_cell,[]);
0291 if matches > 0 && ~wild
0292 origin = 2;
0293
0294 else
0295 [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,name,false,false,...
0296 phylDist,org_index,SAcell,stringMatchesEC_cell,[]);
0297 if matches > 0
0298 origin = 3;
0299
0300 else
0301
0302 stringMatchesSA = extract_string_matches(EC,SAcell{1},wild,[],[]);
0303
0304 [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,name,false,...
0305 true,phylDist,org_index,...
0306 SAcell,stringMatchesEC_cell,stringMatchesSA);
0307 if matches > 0
0308 origin = 4;
0309
0310 else
0311 [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,'',false,...
0312 false,phylDist,...
0313 org_index,SAcell,stringMatchesEC_cell,stringMatchesSA);
0314 if matches > 0
0315 origin = 5;
0316
0317 else
0318 [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,'',...
0319 false,true,phylDist,...
0320 org_index,SAcell,stringMatchesEC_cell,stringMatchesSA);
0321 if matches > 0
0322 origin = 6;
0323 end
0324 end
0325
0326 end
0327 end
0328 end
0329 end
0330 end
0331
0332 function [kcat,matches] = matchKcat(EC,subs,substrCoeff,KCATcell,organism,...
0333 substrate,SA,phylDist,...
0334 org_index,SAcell,KCATcellMatches,SAcellMatches)
0335
0336
0337
0338 kcat = [];
0339 matches = 0;
0340
0341 if SA
0342
0343 EC_indexes = extract_indexes(SAcellMatches,[],SAcell{2},subs,substrate,...
0344 organism,org_index,phylDist);
0345
0346 kcat = SAcell{3}(EC_indexes);
0347 org_cell = SAcell{2}(EC_indexes);
0348 MW_BRENDA = SAcell{4}(EC_indexes);
0349
0350 else
0351
0352 EC_indexes = extract_indexes(KCATcellMatches,KCATcell{2},KCATcell{3},...
0353 subs,substrate,organism,org_index,...
0354 phylDist);
0355 if substrate
0356 for j = 1:length(EC_indexes)
0357 indx = EC_indexes(j);
0358 for k = 1:length(subs)
0359 if (isempty(subs{k}))
0360 break;
0361 end
0362
0363 if ~isempty(subs{k}) && strcmpi(subs{k},KCATcell{2}(indx))
0364 if KCATcell{4}(indx) > 0
0365 coeff = min(substrCoeff);
0366 kCatTmp = KCATcell{4}(indx);
0367 kcat = [kcat;kCatTmp/coeff];
0368 end
0369 end
0370 end
0371 end
0372 else
0373 kcat = KCATcell{4}(EC_indexes);
0374 end
0375 end
0376
0377 if isempty(kcat)
0378 kcat = 0;
0379 else
0380 matches = length(kcat);
0381 [kcat,MaxIndx] = max(kcat);
0382 end
0383
0384 if kcat>(1E7)
0385 kcat = 1E7;
0386 end
0387 end
0388
0389
0390
0391
0392 function EC_indexes = extract_string_matches(EC,EC_cell,wild,ECIndexIds,EcIndexIndices)
0393 EC_indexes = [];
0394 EC_indexesOld = [];
0395 if wild
0396 if (~isempty(ECIndexIds))
0397 X = find(contains(ECIndexIds, EC));
0398 for j = 1:length(X)
0399 EC_indexes = [EC_indexes,EcIndexIndices{X(j)}];
0400 end
0401 else
0402 for j=1:length(EC_cell)
0403 if strfind(EC_cell{j},EC)==1
0404 EC_indexes = [EC_indexes,j];
0405 end
0406 end
0407 end
0408 else
0409 if (~isempty(ECIndexIds))
0410 mtch = find(strcmpi(EC,ECIndexIds));
0411 if (~isempty(mtch))
0412 EC_indexes = EcIndexIndices{mtch};
0413 end
0414 else
0415 if ~isempty(EC_cell)
0416 EC_indexes = transpose(find(strcmpi(EC,EC_cell)));
0417 end
0418 end
0419 end
0420
0421 end
0422
0423
0424
0425 function EC_indexes = extract_indexes(EC_indCellStringMatches,subs_cell,orgs_cell,subs,...
0426 substrate,organism, org_index,...
0427 phylDist)
0428
0429 EC_indexes = EC_indCellStringMatches;
0430
0431
0432
0433
0434 if substrate
0435 if (~isempty(EC_indexes))
0436 Subs_indexes = [];
0437 for l = 1:length(subs)
0438 if (isempty(subs{l}))
0439 break;
0440 end
0441 Subs_indexes = horzcat(Subs_indexes,EC_indexes(strcmpi(subs(l),...
0442 subs_cell(EC_indexes))));
0443 end
0444 EC_indexes = Subs_indexes;
0445 end
0446 end
0447
0448 EC_orgs = orgs_cell(EC_indexes);
0449
0450
0451 if string(organism) ~= ''
0452 EC_indexes = EC_indexes(strcmpi(string(organism),EC_orgs));
0453
0454
0455
0456 elseif org_index~='*'
0457 KEGG_indexes = [];temp = [];
0458
0459
0460
0461
0462 for j=1:length(EC_indexes)
0463
0464 orgs_index = find(strcmpi(orgs_cell(EC_indexes(j)),phylDist.names),1);
0465 if ~isempty(orgs_index)
0466 KEGG_indexes = [KEGG_indexes; orgs_index];
0467 temp = [temp;EC_indexes(j)];
0468
0469
0470 else
0471 org = orgs_cell{EC_indexes(j)};
0472 orgGenus = lower(regexprep(org,'\s.*',''));
0473 if isKey(phylDist.genusHashMap,orgGenus)
0474 matchInd = cell2mat(values(phylDist.genusHashMap,{orgGenus}));
0475 matches = phylDist.uniqueGenusIndices{matchInd};
0476 k = matches(1);
0477 KEGG_indexes = [KEGG_indexes;k];
0478 temp = [temp;EC_indexes(j)];
0479 end
0480 end
0481 end
0482
0483 EC_indexes = temp;
0484
0485
0486 if ~isempty(EC_indexes)
0487 distances = phylDist.distMat(org_index,KEGG_indexes);
0488 EC_indexes = EC_indexes(distances == min(distances));
0489 end
0490 end
0491 end
0492
0493 function org_index = find_inKEGG(org_name,names)
0494 org_index = find(strcmpi(org_name,names));
0495 if numel(org_index)>1
0496 org_index = org_index(1);
0497 elseif isempty(org_index)
0498 org_name = regexprep(org_name,'\s.*','');
0499 org_index = find(strcmpi(org_name,names));
0500 if numel(org_index)>1
0501 org_index = org_index(1);
0502 elseif isempty(org_index)
0503 org_index = '*';
0504 end
0505 end
0506 end
0507
0508 function phylDistStruct = KEGG_struct(phylpath)
0509 load(phylpath)
0510 phylDistStruct.ids = transpose(phylDistStruct.ids);
0511 phylDistStruct.names = transpose(phylDistStruct.names);
0512 phylDistStruct.names = regexprep(phylDistStruct.names,'\s*\(.*','');
0513 end