0001 function [draftModel, hitGenes]=getModelFromHomology(models,blastStructure,...
0002 getModelFor,preferredOrder,strictness,onlyGenesInModels,maxE,...
0003 minLen,minIde,mapNewGenesToOld)
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069 hitGenes.oldGenes = [];
0070 hitGenes.newGenes = [];
0071
0072 getModelFor=char(getModelFor);
0073
0074 if nargin<4
0075 preferredOrder=[];
0076 else
0077 preferredOrder=convertCharArray(preferredOrder);
0078 [row,col]=size(preferredOrder);
0079 if col>row
0080 preferredOrder=transpose(preferredOrder);
0081 end
0082 end
0083 if nargin<5
0084 strictness=1;
0085 end
0086 if nargin<6
0087 onlyGenesInModels=false;
0088 end
0089 if nargin<7
0090 maxE=10^-30;
0091 end
0092 if nargin<8
0093 minLen=200;
0094 end
0095 if nargin<9
0096 minIde=40;
0097 end
0098 if nargin<10
0099 mapNewGenesToOld=true;
0100 end
0101
0102 if isfield(models,'S')
0103 models={models};
0104 end
0105
0106
0107 modelNames=cell(numel(models),1);
0108 for i=1:numel(models)
0109 modelNames{i}=models{i}.id;
0110
0111
0112 if isfield(models{i},'geneShortNames')
0113 models{i}=rmfield(models{i},'geneShortNames');
0114 end
0115 if isfield(models{i},'geneMiriams')
0116 models{i}=rmfield(models{i},'geneMiriams');
0117 end
0118
0119
0120 if isfield(models{i},'geneFrom')
0121 models{i}=rmfield(models{i},'geneFrom');
0122 end
0123 end
0124
0125
0126 for i=1:numel(blastStructure)
0127 problemGenes = startsWith(blastStructure(i).fromGenes,'(') | endsWith(blastStructure(i).fromGenes,')');
0128 if any(problemGenes)
0129 error(['One or multiple gene identifiers from ' blastStructure(i).fromId ...
0130 ' starts with ''('' and/or ends with '')'', which is not allowed'])
0131 end
0132 end
0133
0134
0135
0136
0137
0138
0139 for i=1:numel(blastStructure)
0140 if ~strcmp(blastStructure(i).fromId,getModelFor)
0141 j=strcmpi(blastStructure(i).fromId,modelNames);
0142 if j==0
0143 error(['While the blastStructure contains sequences from '...
0144 'organismID "%s" (as\nprovided in getBlast), none of '...
0145 'template models have this id (as model.id)'],...
0146 string(blastStructure(i).fromId));
0147 end
0148 k=sum(ismember(blastStructure(i).fromGenes,models{j}.genes));
0149 if k<(numel(models{j}.genes)*0.05)
0150 error(['Less than 5%% of the genes in the template model '...
0151 'with model.id "%s"\ncan be found in the blastStructure. '...
0152 'Ensure that the protein FASTA\nused in getBlast and '...
0153 'the template model used in getModelFromHomology\nuse '...
0154 'the same style of gene identifiers'],models{j}.id)
0155 end
0156 end
0157 end
0158
0159
0160 for i=1:length(models)
0161 fprintf('\nStandardizing grRules of template model with ID "%s" ...',models{i}.id);
0162 [models{i}.grRules,models{i}.rxnGeneMat]=standardizeGrRules(models{i},false);
0163 end
0164 fprintf(' done\n');
0165
0166
0167 for i=1:numel(blastStructure)
0168 indexes=blastStructure(i).evalue<maxE & blastStructure(i).aligLen>=minLen & blastStructure(i).identity>=minIde;
0169 blastStructure(i).fromGenes(~indexes)=[];
0170 blastStructure(i).toGenes(~indexes)=[];
0171 blastStructure(i).evalue(~indexes)=[];
0172 blastStructure(i).identity(~indexes)=[];
0173 blastStructure(i).aligLen(~indexes)=[];
0174 blastStructure(i).bitscore(~indexes)=[];
0175 blastStructure(i).ppos(~indexes)=[];
0176 end
0177
0178
0179
0180
0181 for i=1:numel(models)
0182 [hasGenes, ~]=find(models{i}.rxnGeneMat);
0183 hasNoGenes=1:numel(models{i}.rxns);
0184 hasNoGenes(hasGenes)=[];
0185 models{i}=removeReactions(models{i},hasNoGenes,true,true);
0186 end
0187
0188
0189
0190
0191
0192
0193 allGenes=cell(numel(models)+1,1);
0194 if isempty(preferredOrder)
0195 useOrder=modelNames;
0196 else
0197 useOrder=preferredOrder;
0198 end
0199
0200
0201 useOrderIndexes=zeros(numel(models),1);
0202 for i=1:numel(models)
0203 [~, index]=ismember(models{i}.id,useOrder);
0204 useOrderIndexes(index)=i;
0205 end
0206
0207
0208 if onlyGenesInModels==true
0209 modelGenes={};
0210 for i=1:numel(models)
0211 modelGenes=[modelGenes;models{i}.genes];
0212 end
0213 for i=1:numel(blastStructure)
0214
0215 if strcmpi(blastStructure(i).fromId,getModelFor)
0216 I=ismember(blastStructure(i).toGenes,modelGenes);
0217 else
0218 I=ismember(blastStructure(i).fromGenes,modelGenes);
0219 end
0220 blastStructure(i).fromGenes(~I)=[];
0221 blastStructure(i).toGenes(~I)=[];
0222 blastStructure(i).evalue(~I)=[];
0223 blastStructure(i).identity(~I)=[];
0224 blastStructure(i).aligLen(~I)=[];
0225 blastStructure(i).bitscore(~I)=[];
0226 blastStructure(i).ppos(~I)=[];
0227
0228
0229
0230 if isempty(blastStructure(i).fromGenes)
0231 EM=['No genes in matching from ' blastStructure(i).fromId ' to ' blastStructure(i).toId ' are present in the corresponding model'];
0232 dispEM(EM);
0233 end
0234 end
0235 end
0236
0237
0238
0239
0240
0241 if strictness==3
0242 for i=1:numel(blastStructure)
0243 keep=false(numel(blastStructure(i).toGenes),1);
0244 [allFromGenes, ~, I]=unique(blastStructure(i).fromGenes);
0245
0246
0247 for j=1:numel(allFromGenes)
0248 allMatches=find(I==j);
0249 bestMatches=allMatches(blastStructure(i).evalue(allMatches)==min(blastStructure(i).evalue(allMatches)));
0250
0251
0252 keep(bestMatches)=true;
0253 end
0254
0255
0256 blastStructure(i).fromGenes(~keep)=[];
0257 blastStructure(i).toGenes(~keep)=[];
0258 blastStructure(i).evalue(~keep)=[];
0259 blastStructure(i).identity(~keep)=[];
0260 blastStructure(i).aligLen(~keep)=[];
0261 blastStructure(i).bitscore(~keep)=[];
0262 blastStructure(i).ppos(~keep)=[];
0263 end
0264 end
0265
0266 useOrder=[{getModelFor};useOrder];
0267
0268 for i=1:numel(blastStructure)
0269 [~, toIndex]=ismember(blastStructure(i).toId,useOrder);
0270 [~, fromIndex]=ismember(blastStructure(i).fromId,useOrder);
0271
0272
0273 allGenes{toIndex}=[allGenes{toIndex};blastStructure(i).toGenes];
0274 allGenes{fromIndex}=[allGenes{fromIndex};blastStructure(i).fromGenes];
0275 end
0276
0277
0278 maxOtherGeneNr=0;
0279 for i=1:numel(allGenes)
0280 allGenes{i}=unique(allGenes{i});
0281 if i>1
0282 if numel(allGenes{i})>maxOtherGeneNr
0283 maxOtherGeneNr=numel(allGenes{i});
0284 end
0285 end
0286 end
0287
0288
0289
0290
0291
0292
0293
0294
0295
0296
0297
0298 allTo=cell(numel(useOrder)-1,1);
0299 allFrom=cell(numel(useOrder)-1,1);
0300
0301 for i=1:numel(useOrder)-1
0302 allTo{i}=sparse(numel(allGenes{1}),numel(allGenes{i+1}));
0303 allFrom{i}=sparse(numel(allGenes{1}),numel(allGenes{i+1}));
0304 end
0305
0306
0307 for i=1:numel(blastStructure)
0308 if strcmp(blastStructure(i).toId,getModelFor)
0309
0310
0311 [~, a]=ismember(blastStructure(i).toGenes,allGenes{1});
0312 [~, fromModel]=ismember(blastStructure(i).fromId,useOrder);
0313 [~, b]=ismember(blastStructure(i).fromGenes,allGenes{fromModel});
0314 idx = sub2ind(size(allTo{fromModel-1}), a, b);
0315 allTo{fromModel-1}(idx)=1;
0316 else
0317
0318 [~, a]=ismember(blastStructure(i).fromGenes,allGenes{1});
0319 [~, toModel]=ismember(blastStructure(i).toId,useOrder);
0320 [~, b]=ismember(blastStructure(i).toGenes,allGenes{toModel});
0321 idx = sub2ind(size(allFrom{toModel-1}), a, b);
0322 allFrom{toModel-1}(idx)=1;
0323 end
0324 end
0325
0326
0327
0328
0329
0330
0331
0332 finalMappings=cell(numel(useOrder)-1,1);
0333 if strictness==1 || strictness==3
0334 for j=1:numel(allFrom)
0335 finalMappings{j}=allTo{j}~=0 & allFrom{j}~=0;
0336 end
0337 else
0338 if mapNewGenesToOld==true
0339 finalMappings=allFrom;
0340 else
0341 finalMappings=allTo;
0342 end
0343 end
0344
0345
0346
0347
0348
0349 usedNewGenes=false(numel(allGenes{1}),1);
0350
0351 for i=1:numel(allGenes)-1
0352
0353 if onlyGenesInModels==false
0354 a=ismember(allGenes{i+1},models{useOrderIndexes(i)}.genes);
0355 finalMappings{i}(:,~a)=false;
0356 end
0357
0358
0359 [a, b]=find(finalMappings{i});
0360 usedGenes=false(numel(allGenes{i+1}),1);
0361 usedNewGenes(a)=true;
0362 usedGenes(b)=true;
0363 finalMappings{i}=finalMappings{i}(:,usedGenes);
0364 allGenes{i+1}=allGenes{i+1}(usedGenes);
0365 end
0366
0367
0368 allGenes{1}=allGenes{1}(usedNewGenes);
0369 for i=1:numel(finalMappings)
0370 finalMappings{i}=finalMappings{i}(usedNewGenes,:);
0371 end
0372
0373
0374
0375
0376
0377
0378
0379
0380
0381
0382
0383 for i=1:numel(models)
0384 a=ismember(models{useOrderIndexes(i)}.genes,allGenes{i+1});
0385
0386
0387
0388
0389 [rxnsToKeep,~] = find(models{useOrderIndexes(i)}.rxnGeneMat(:,a));
0390 rxnsToRemove = repmat(1,numel(models{useOrderIndexes(i)}.rxns),1);
0391 rxnsToRemove(rxnsToKeep) = 0;
0392 rxnsToRemove = find(rxnsToRemove);
0393 models{useOrderIndexes(i)}=removeReactions(models{useOrderIndexes(i)},rxnsToRemove,true,true,true);
0394 end
0395
0396
0397
0398
0399
0400
0401 allUsedGenes=false(numel(allGenes{1}),1);
0402
0403 if ~isempty(preferredOrder) && numel(models)>1
0404 [usedGenes, ~]=find(finalMappings{1});
0405 allUsedGenes(usedGenes)=true;
0406 for i=2:numel(finalMappings)
0407 [usedGenes, ~]=find(finalMappings{i});
0408 usedGenes=unique(usedGenes);
0409 a=ismember(usedGenes,find(allUsedGenes));
0410
0411 [~, genesToDelete]=find(finalMappings{i}(usedGenes(a),:));
0412 genesToDelete=unique(genesToDelete);
0413
0414
0415
0416 models{useOrderIndexes(i)}=removeGenes(models{useOrderIndexes(i)},allGenes{i+1}(genesToDelete),true,true,false);
0417 allUsedGenes(usedGenes)=true;
0418
0419
0420 finalMappings{i}(:,genesToDelete)=[];
0421 allGenes{i+1}(genesToDelete)=[];
0422 end
0423 end
0424
0425
0426
0427 for i=1:numel(models)
0428
0429 [newGenes, oldGenes]=find(finalMappings{i});
0430
0431
0432
0433 replaceableGenes=allGenes{i+1}(unique(oldGenes));
0434 nonReplaceableGenes=setdiff(models{useOrderIndexes(i)}.genes,replaceableGenes);
0435 fullGeneList=[allGenes{1}(unique(newGenes));nonReplaceableGenes];
0436
0437
0438
0439 nonRepStartIndex=numel(unique(newGenes));
0440
0441
0442 newRxnGeneMat=sparse(numel(models{useOrderIndexes(i)}.rxns),numel(fullGeneList));
0443
0444
0445
0446
0447 for j=1:numel(models{useOrderIndexes(i)}.rxns)
0448
0449 [~, oldGeneIds]=find(models{useOrderIndexes(i)}.rxnGeneMat(j,:));
0450
0451
0452
0453 for k=1:numel(oldGeneIds)
0454
0455
0456
0457
0458 geneName=models{useOrderIndexes(i)}.genes(oldGeneIds(k));
0459
0460
0461 mapIndex=find(ismember(allGenes{i+1},geneName));
0462
0463 if ~isempty(mapIndex)
0464
0465 hitGenes.oldGenes = [hitGenes.oldGenes, {geneName}];
0466
0467
0468 a=find(finalMappings{i}(:,mapIndex));
0469
0470
0471 [~, b]=ismember(allGenes{1}(a),fullGeneList);
0472
0473
0474 newRxnGeneMat(j,b)=1;
0475
0476
0477
0478
0479
0480 repString=fullGeneList{b(1)};
0481 if numel(b)>1
0482 for l=2:numel(b)
0483 repString=[repString ' or ' fullGeneList{b(l)}];
0484 end
0485 repString=['(' repString ')'];
0486 end
0487
0488
0489 hitGenes.newGenes = [hitGenes.newGenes, {repString}];
0490
0491
0492 models{useOrderIndexes(i)}.grRules{j}=regexprep(models{useOrderIndexes(i)}.grRules{j},['(^|\s|\()' geneName{1} '($|\s|\))'],['$1' repString '$2']);
0493 else
0494
0495
0496 index=find(ismember(nonReplaceableGenes,geneName));
0497
0498
0499 newRxnGeneMat(j,nonRepStartIndex+index)=1;
0500
0501 models{useOrderIndexes(i)}.grRules{j}=strrep(models{useOrderIndexes(i)}.grRules{j},geneName{1},strcat('OLD_',models{useOrderIndexes(i)}.id,'_',geneName{1}));
0502 end
0503 end
0504 end
0505
0506
0507 models{useOrderIndexes(i)}.rxnGeneMat=newRxnGeneMat;
0508 if ~isempty(nonReplaceableGenes)
0509 models{useOrderIndexes(i)}.genes=[allGenes{1}(unique(newGenes));strcat('OLD_',models{useOrderIndexes(i)}.id,'_',nonReplaceableGenes)];
0510 else
0511 models{useOrderIndexes(i)}.genes=allGenes{1}(unique(newGenes));
0512 end
0513 if isfield(models{useOrderIndexes(i)},'geneComps')
0514 geneComps=models{useOrderIndexes(i)}.geneComps(1);
0515 models{useOrderIndexes(i)}.geneComps=zeros(numel(models{useOrderIndexes(i)}.genes),1);
0516
0517
0518 models{useOrderIndexes(i)}.geneComps(:)=geneComps;
0519 end
0520 end
0521
0522
0523
0524 draftModel=mergeModels(models,'metNames');
0525
0526
0527 regexStr=['OLD_(', strjoin(modelNames(:),'|'),')_(\S^\))+'];
0528 draftModel.grRules=regexprep(draftModel.grRules,[' or ' regexStr],'');
0529 draftModel.grRules=regexprep(draftModel.grRules,[regexStr ' or '],'');
0530
0531
0532 draftModel.id=getModelFor;
0533 name='Generated by getModelFromHomology using ';
0534 for i=1:numel(models)
0535 if i<numel(models)
0536 name=[name models{i}.id ', '];
0537 else
0538 name=[name models{i}.id];
0539 end
0540 end
0541 draftModel.name=name;
0542 draftModel.rxnNotes=cell(length(draftModel.rxns),1);
0543 draftModel.rxnNotes(:)={'Included by getModelFromHomology'};
0544 draftModel.rxnConfidenceScores=NaN(length(draftModel.rxns),1);
0545 draftModel.rxnConfidenceScores(:)=2;
0546 draftModel=deleteUnusedGenes(draftModel,0);
0547
0548 [draftModel.grRules,draftModel.rxnGeneMat]=standardizeGrRules(draftModel,false);
0549 draftModel=deleteUnusedGenes(draftModel,false);
0550 end