0001 function [draftModel, hitGenes]=getModelFromHomology(models,blastStructure,...
0002 getModelFor,preferredOrder,strictness,onlyGenesInModels,maxE,...
0003 minLen,minIde,mapNewGenesToOld)
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069 hitGenes.oldGenes = [];
0070 hitGenes.newGenes = [];
0071
0072 getModelFor=char(getModelFor);
0073
0074 if nargin<4
0075 preferredOrder=[];
0076 else
0077 preferredOrder=convertCharArray(preferredOrder);
0078 [row,col]=size(preferredOrder);
0079 if col>row
0080 preferredOrder=transpose(preferredOrder);
0081 end
0082 end
0083 if nargin<5
0084 strictness=1;
0085 end
0086 if nargin<6
0087 onlyGenesInModels=false;
0088 end
0089 if nargin<7
0090 maxE=10^-30;
0091 end
0092 if nargin<8
0093 minLen=200;
0094 end
0095 if nargin<9
0096 minIde=40;
0097 end
0098 if nargin<10
0099 mapNewGenesToOld=true;
0100 end
0101
0102 if isfield(models,'S')
0103 models={models};
0104 end
0105
0106
0107 modelNames=cell(numel(models),1);
0108 for i=1:numel(models)
0109 modelNames{i}=models{i}.id;
0110
0111
0112 if isfield(models{i},'geneShortNames')
0113 models{i}=rmfield(models{i},'geneShortNames');
0114 end
0115 if isfield(models{i},'geneMiriams')
0116 models{i}=rmfield(models{i},'geneMiriams');
0117 end
0118 if isfield(models{i},'proteins')
0119 models{i}=rmfield(models{i},'proteins');
0120 end
0121
0122
0123 if isfield(models{i},'geneFrom')
0124 models{i}=rmfield(models{i},'geneFrom');
0125 end
0126 end
0127
0128
0129 for i=1:numel(blastStructure)
0130 problemGenes = startsWith(blastStructure(i).fromGenes,'(') | endsWith(blastStructure(i).fromGenes,')');
0131 if any(problemGenes)
0132 error(['One or multiple gene identifiers from ' blastStructure(i).fromId ...
0133 ' starts with ''('' and/or ends with '')'', which is not allowed'])
0134 end
0135 end
0136
0137
0138
0139
0140
0141
0142 for i=1:numel(blastStructure)
0143 if ~strcmp(blastStructure(i).fromId,getModelFor)
0144 j=strcmpi(blastStructure(i).fromId,modelNames);
0145 if j==0
0146 error(['While the blastStructure contains sequences from '...
0147 'organismID "%s" (as\nprovided in getBlast), none of '...
0148 'template models have this id (as model.id)'],...
0149 string(blastStructure(i).fromId));
0150 end
0151 k=sum(ismember(blastStructure(i).fromGenes,models{j}.genes));
0152 if k<(numel(models{j}.genes)*0.05)
0153 error(['Less than 5%% of the genes in the template model '...
0154 'with model.id "%s"\ncan be found in the blastStructure. '...
0155 'Ensure that the protein FASTA\nused in getBlast and '...
0156 'the template model used in getModelFromHomology\nuse '...
0157 'the same style of gene identifiers'],models{j}.id)
0158 end
0159 end
0160 end
0161
0162
0163 for i=1:length(models)
0164 fprintf('\nStandardizing grRules of template model with ID "%s" ...',models{i}.id);
0165 [models{i}.grRules,models{i}.rxnGeneMat]=standardizeGrRules(models{i},false);
0166 end
0167 fprintf(' done\n');
0168
0169
0170 for i=1:numel(blastStructure)
0171 indexes=blastStructure(i).evalue<maxE & blastStructure(i).aligLen>=minLen & blastStructure(i).identity>=minIde;
0172 blastStructure(i).fromGenes(~indexes)=[];
0173 blastStructure(i).toGenes(~indexes)=[];
0174 blastStructure(i).evalue(~indexes)=[];
0175 blastStructure(i).identity(~indexes)=[];
0176 blastStructure(i).aligLen(~indexes)=[];
0177 blastStructure(i).bitscore(~indexes)=[];
0178 blastStructure(i).ppos(~indexes)=[];
0179 end
0180
0181
0182
0183
0184 for i=1:numel(models)
0185 [hasGenes, ~]=find(models{i}.rxnGeneMat);
0186 hasNoGenes=1:numel(models{i}.rxns);
0187 hasNoGenes(hasGenes)=[];
0188 models{i}=removeReactions(models{i},hasNoGenes,true,true);
0189 end
0190
0191
0192
0193
0194
0195
0196 allGenes=cell(numel(models)+1,1);
0197 if isempty(preferredOrder)
0198 useOrder=modelNames;
0199 else
0200 useOrder=preferredOrder;
0201 end
0202
0203
0204 useOrderIndexes=zeros(numel(models),1);
0205 for i=1:numel(models)
0206 [~, index]=ismember(models{i}.id,useOrder);
0207 useOrderIndexes(index)=i;
0208 end
0209
0210
0211 if onlyGenesInModels==true
0212 modelGenes={};
0213 for i=1:numel(models)
0214 modelGenes=[modelGenes;models{i}.genes];
0215 end
0216 for i=1:numel(blastStructure)
0217
0218 if strcmpi(blastStructure(i).fromId,getModelFor)
0219 I=ismember(blastStructure(i).toGenes,modelGenes);
0220 else
0221 I=ismember(blastStructure(i).fromGenes,modelGenes);
0222 end
0223 blastStructure(i).fromGenes(~I)=[];
0224 blastStructure(i).toGenes(~I)=[];
0225 blastStructure(i).evalue(~I)=[];
0226 blastStructure(i).identity(~I)=[];
0227 blastStructure(i).aligLen(~I)=[];
0228 blastStructure(i).bitscore(~I)=[];
0229 blastStructure(i).ppos(~I)=[];
0230
0231
0232
0233 if isempty(blastStructure(i).fromGenes)
0234 EM=['No genes in matching from ' blastStructure(i).fromId ' to ' blastStructure(i).toId ' are present in the corresponding model'];
0235 dispEM(EM);
0236 end
0237 end
0238 end
0239
0240
0241
0242
0243
0244 if strictness==3
0245 for i=1:numel(blastStructure)
0246 keep=false(numel(blastStructure(i).toGenes),1);
0247 [allFromGenes, ~, I]=unique(blastStructure(i).fromGenes);
0248
0249
0250 for j=1:numel(allFromGenes)
0251 allMatches=find(I==j);
0252 bestMatches=allMatches(blastStructure(i).evalue(allMatches)==min(blastStructure(i).evalue(allMatches)));
0253
0254
0255 keep(bestMatches)=true;
0256 end
0257
0258
0259 blastStructure(i).fromGenes(~keep)=[];
0260 blastStructure(i).toGenes(~keep)=[];
0261 blastStructure(i).evalue(~keep)=[];
0262 blastStructure(i).identity(~keep)=[];
0263 blastStructure(i).aligLen(~keep)=[];
0264 blastStructure(i).bitscore(~keep)=[];
0265 blastStructure(i).ppos(~keep)=[];
0266 end
0267 end
0268
0269 useOrder=[{getModelFor};useOrder];
0270
0271 for i=1:numel(blastStructure)
0272 [~, toIndex]=ismember(blastStructure(i).toId,useOrder);
0273 [~, fromIndex]=ismember(blastStructure(i).fromId,useOrder);
0274
0275
0276 allGenes{toIndex}=[allGenes{toIndex};blastStructure(i).toGenes];
0277 allGenes{fromIndex}=[allGenes{fromIndex};blastStructure(i).fromGenes];
0278 end
0279
0280
0281 maxOtherGeneNr=0;
0282 for i=1:numel(allGenes)
0283 allGenes{i}=unique(allGenes{i});
0284 if i>1
0285 if numel(allGenes{i})>maxOtherGeneNr
0286 maxOtherGeneNr=numel(allGenes{i});
0287 end
0288 end
0289 end
0290
0291
0292
0293
0294
0295
0296
0297
0298
0299
0300
0301 allTo=cell(numel(useOrder)-1,1);
0302 allFrom=cell(numel(useOrder)-1,1);
0303
0304 for i=1:numel(useOrder)-1
0305 allTo{i}=sparse(numel(allGenes{1}),numel(allGenes{i+1}));
0306 allFrom{i}=sparse(numel(allGenes{1}),numel(allGenes{i+1}));
0307 end
0308
0309
0310 for i=1:numel(blastStructure)
0311 if strcmp(blastStructure(i).toId,getModelFor)
0312
0313
0314 [~, a]=ismember(blastStructure(i).toGenes,allGenes{1});
0315 [~, fromModel]=ismember(blastStructure(i).fromId,useOrder);
0316 [~, b]=ismember(blastStructure(i).fromGenes,allGenes{fromModel});
0317 idx = sub2ind(size(allTo{fromModel-1}), a, b);
0318 allTo{fromModel-1}(idx)=1;
0319 else
0320
0321 [~, a]=ismember(blastStructure(i).fromGenes,allGenes{1});
0322 [~, toModel]=ismember(blastStructure(i).toId,useOrder);
0323 [~, b]=ismember(blastStructure(i).toGenes,allGenes{toModel});
0324 idx = sub2ind(size(allFrom{toModel-1}), a, b);
0325 allFrom{toModel-1}(idx)=1;
0326 end
0327 end
0328
0329
0330
0331
0332
0333
0334
0335 finalMappings=cell(numel(useOrder)-1,1);
0336 if strictness==1 || strictness==3
0337 for j=1:numel(allFrom)
0338 finalMappings{j}=allTo{j}~=0 & allFrom{j}~=0;
0339 end
0340 else
0341 if mapNewGenesToOld==true
0342 finalMappings=allFrom;
0343 else
0344 finalMappings=allTo;
0345 end
0346 end
0347
0348
0349
0350
0351
0352 usedNewGenes=false(numel(allGenes{1}),1);
0353
0354 for i=1:numel(allGenes)-1
0355
0356 if onlyGenesInModels==false
0357 a=ismember(allGenes{i+1},models{useOrderIndexes(i)}.genes);
0358 finalMappings{i}(:,~a)=false;
0359 end
0360
0361
0362 [a, b]=find(finalMappings{i});
0363 usedGenes=false(numel(allGenes{i+1}),1);
0364 usedNewGenes(a)=true;
0365 usedGenes(b)=true;
0366 finalMappings{i}=finalMappings{i}(:,usedGenes);
0367 allGenes{i+1}=allGenes{i+1}(usedGenes);
0368 end
0369
0370
0371 allGenes{1}=allGenes{1}(usedNewGenes);
0372 for i=1:numel(finalMappings)
0373 finalMappings{i}=finalMappings{i}(usedNewGenes,:);
0374 end
0375
0376
0377
0378
0379
0380
0381
0382
0383
0384
0385
0386 for i=1:numel(models)
0387 a=ismember(models{useOrderIndexes(i)}.genes,allGenes{i+1});
0388
0389
0390
0391
0392 [rxnsToKeep,~] = find(models{useOrderIndexes(i)}.rxnGeneMat(:,a));
0393 rxnsToRemove = repmat(1,numel(models{useOrderIndexes(i)}.rxns),1);
0394 rxnsToRemove(rxnsToKeep) = 0;
0395 rxnsToRemove = find(rxnsToRemove);
0396 models{useOrderIndexes(i)}=removeReactions(models{useOrderIndexes(i)},rxnsToRemove,true,true,true);
0397 end
0398
0399
0400
0401
0402
0403
0404 allUsedGenes=false(numel(allGenes{1}),1);
0405
0406 if ~isempty(preferredOrder) && numel(models)>1
0407 [usedGenes, ~]=find(finalMappings{1});
0408 allUsedGenes(usedGenes)=true;
0409 for i=2:numel(finalMappings)
0410 [usedGenes, ~]=find(finalMappings{i});
0411 usedGenes=unique(usedGenes);
0412 a=ismember(usedGenes,find(allUsedGenes));
0413
0414 [~, genesToDelete]=find(finalMappings{i}(usedGenes(a),:));
0415 genesToDelete=unique(genesToDelete);
0416
0417
0418
0419 models{useOrderIndexes(i)}=removeGenes(models{useOrderIndexes(i)},allGenes{i+1}(genesToDelete),true,true,false);
0420 allUsedGenes(usedGenes)=true;
0421
0422
0423 finalMappings{i}(:,genesToDelete)=[];
0424 allGenes{i+1}(genesToDelete)=[];
0425 end
0426 end
0427
0428
0429
0430 for i=1:numel(models)
0431
0432 [newGenes, oldGenes]=find(finalMappings{i});
0433
0434
0435
0436 replaceableGenes=allGenes{i+1}(unique(oldGenes));
0437 nonReplaceableGenes=setdiff(models{useOrderIndexes(i)}.genes,replaceableGenes);
0438 fullGeneList=[allGenes{1}(unique(newGenes));nonReplaceableGenes];
0439
0440
0441
0442 nonRepStartIndex=numel(unique(newGenes));
0443
0444
0445 newRxnGeneMat=sparse(numel(models{useOrderIndexes(i)}.rxns),numel(fullGeneList));
0446
0447
0448
0449
0450 for j=1:numel(models{useOrderIndexes(i)}.rxns)
0451
0452 [~, oldGeneIds]=find(models{useOrderIndexes(i)}.rxnGeneMat(j,:));
0453
0454
0455
0456 for k=1:numel(oldGeneIds)
0457
0458
0459
0460
0461 geneName=models{useOrderIndexes(i)}.genes(oldGeneIds(k));
0462
0463
0464 mapIndex=find(ismember(allGenes{i+1},geneName));
0465
0466 if ~isempty(mapIndex)
0467
0468 hitGenes.oldGenes = [hitGenes.oldGenes, {geneName}];
0469
0470
0471 a=find(finalMappings{i}(:,mapIndex));
0472
0473
0474 [~, b]=ismember(allGenes{1}(a),fullGeneList);
0475
0476
0477 newRxnGeneMat(j,b)=1;
0478
0479
0480
0481
0482
0483 repString=fullGeneList{b(1)};
0484 if numel(b)>1
0485 for l=2:numel(b)
0486 repString=[repString ' or ' fullGeneList{b(l)}];
0487 end
0488 repString=['(' repString ')'];
0489 end
0490
0491
0492 hitGenes.newGenes = [hitGenes.newGenes, {repString}];
0493
0494
0495 models{useOrderIndexes(i)}.grRules{j}=regexprep(models{useOrderIndexes(i)}.grRules{j},['(^|\s|\()' geneName{1} '($|\s|\))'],['$1' repString '$2']);
0496 else
0497
0498
0499 index=find(ismember(nonReplaceableGenes,geneName));
0500
0501
0502 newRxnGeneMat(j,nonRepStartIndex+index)=1;
0503
0504 models{useOrderIndexes(i)}.grRules{j}=strrep(models{useOrderIndexes(i)}.grRules{j},geneName{1},strcat('OLD_',models{useOrderIndexes(i)}.id,'_',geneName{1}));
0505 end
0506 end
0507 end
0508
0509
0510 models{useOrderIndexes(i)}.rxnGeneMat=newRxnGeneMat;
0511 if ~isempty(nonReplaceableGenes)
0512 models{useOrderIndexes(i)}.genes=[allGenes{1}(unique(newGenes));strcat('OLD_',models{useOrderIndexes(i)}.id,'_',nonReplaceableGenes)];
0513 else
0514 models{useOrderIndexes(i)}.genes=allGenes{1}(unique(newGenes));
0515 end
0516 if isfield(models{useOrderIndexes(i)},'geneComps')
0517 geneComps=models{useOrderIndexes(i)}.geneComps(1);
0518 models{useOrderIndexes(i)}.geneComps=zeros(numel(models{useOrderIndexes(i)}.genes),1);
0519
0520
0521 models{useOrderIndexes(i)}.geneComps(:)=geneComps;
0522 end
0523 end
0524
0525
0526
0527 draftModel=mergeModels(models,'metNames');
0528
0529
0530 regexStr=['OLD_(', strjoin(modelNames(:),'|'),')_(\S^\))+'];
0531 draftModel.grRules=regexprep(draftModel.grRules,[' or ' regexStr],'');
0532 draftModel.grRules=regexprep(draftModel.grRules,[regexStr ' or '],'');
0533
0534
0535 draftModel.id=getModelFor;
0536 name='Generated by getModelFromHomology using ';
0537 for i=1:numel(models)
0538 if i<numel(models)
0539 name=[name models{i}.id ', '];
0540 else
0541 name=[name models{i}.id];
0542 end
0543 end
0544 draftModel.name=name;
0545 draftModel.rxnNotes=cell(length(draftModel.rxns),1);
0546 draftModel.rxnNotes(:)={'Included by getModelFromHomology'};
0547 draftModel.rxnConfidenceScores=NaN(length(draftModel.rxns),1);
0548 draftModel.rxnConfidenceScores(:)=2;
0549 draftModel=deleteUnusedGenes(draftModel,0);
0550
0551 [draftModel.grRules,draftModel.rxnGeneMat]=standardizeGrRules(draftModel,false);
0552 draftModel=deleteUnusedGenes(draftModel,false);
0553 end