Description of predictLocalization

0001 function [outModel, geneLocalization, transportStruct, scores,...
0002     removedRxns] = predictLocalization(model, GSS,...
0003     defaultCompartment, transportCost, maxTime, plotResults)
0004 % predictLocalization
0005 %    Tries to assign reactions to compartments in a manner that is in
0006 %    agreement with localization predictors while at the same time
0007 %    maintaining connectivity.
0008 %
0009 %   Input:
0010 %    model                   a model structure. If the model contains
0011 %                           several compartments they will be merged
0012 %    GSS                     a gene scoring structure as from parseScores
0013 %    defaultCompartment      transport reactions are expressed as diffusion
0014 %                           between the defaultCompartment and the others.
0015 %                           This is usually the cytosol. The default
0016 %                           compartment must have a match in GSS
0017 %    transportCost           the cost for including a transport reaction. If
0018 %                           this a scalar then the same cost is used for
0019 %                           all metabolites. It can also be a vector of
0020 %                           costs with the same dimension as model.mets.
0021 %                           Note that negative costs will result in that
0022 %                           transport of the metabolite is encouraged (optional,
0023 %                           default 0.5)
0024 %    maxTime                 maximum optimization time in minutes (optional,
0025 %                           default 15)
0026 %    plotResults             true if the results should be plotted during the
0027 %                           optimization (optional, default false)
0028 %
0029 %   Output:
0030 %    outModel                the resulting model structure
0031 %    geneLocalization        structure with the genes and their resulting
0032 %                           localization
0033 %    transportStruct         structure with the transport reactions that had
0034 %                           to be inferred and between which compartments
0035 %    scores                  structure that contains the total score history
0036 %                           together with the score based on gene
0037 %                           localization and the score based on included
0038 %                           transport reactions
0039 %    removedRxns             cell array with the reaction ids that had to be
0040 %                           removed in order to have a connected input
0041 %                           model
0042 %
0043 %    This function requires that the starting network is connected when it
0044 %    is in one compartment. Reactions that are unconnected are removed and
0045 %    saved in removedRxns. Try running fillGaps to have a more connected
0046 %    input model if there are many such reactions. The input model should
0047 %    also not include any exchange, demand or sink reactions, otherwise this
0048 %    function would not provide any results.
0049 %
0050 %    In the final model all metabolites are produced in at least one
0051 %    reaction. This does not guarantee a fully functional model since there
0052 %    can be internal loops. Transport reactions are only included as passive
0053 %    diffusion (A <=> B).
0054 %
0055 %    The score of a model is the sum of scores for all genes in their
0056 %    assigned compartment minus the cost of all transport reactions that had
0057 %    to be included. A gene can only be assigned to one compartment. This is
0058 %    a simplification to keep the problem size down. The problem is solved
0059 %    using simulated annealing.
0060 %
0061 % Usage: [outModel, geneLocalization, transportStruct, scores,...
0062 %       removedRxns] = predictLocalization(model, GSS,...
0063 %       defaultCompartment, transportCost, maxTime, plotResults)
0064 
0065 if nargin<4
0066     transportCost=ones(numel(model.mets),1)*0.5;
0067 end
0068 if numel(transportCost)==1
0069     transportCost=ones(numel(model.mets),1)*transportCost;
0070 end
0071 transportCost=transportCost(:);
0072 
0073 if numel(transportCost)~=numel(model.mets)
0074     EM='The vector of transport costs must have the same dimension as model.mets';
0075     dispEM(EM,true);
0076 end
0077 if nargin<5
0078     maxTime=15;
0079 end
0080 if nargin<6
0081     plotResults=false;
0082 end
0083 
0084 if isfield(model,'rxnComps')
0085     model=rmfield(model,'rxnComps');
0086     EM='The model structure contains information about reaction compartmentalization. This is not supported by this function. The rxnComps field has been deleted';
0087     dispEM(EM,false);
0088 end
0089 if isfield(model,'geneComps')
0090     model=rmfield(model,'geneComps');
0091     EM='The model structure contains information about gene compartmentalization. This is not supported by this function. The geneComps field has been deleted';
0092     dispEM(EM,false);
0093 end
0094 
0095 defaultCompartment=char(defaultCompartment);
0096 I=ismember(defaultCompartment,GSS.compartments);
0097 if I==false
0098     EM='defaultCompartment not found in GSS';
0099     dispEM(EM);
0100 end
0101 
0102 if numel(model.comps)>1
0103     EM='The model has several compartments. All compartments will be merged';
0104     dispEM(EM,false);
0105     model=mergeCompartments(model,true,true);
0106 end
0107 
0108 noGenes = ismember(model.genes,GSS.genes);
0109 if ~all(noGenes)
0110     EM=['For ' num2str(numel(find(~noGenes))) ' of ' num2str(numel(model.genes)) ' model genes no data was found in GSS'];
0111     dispEM(EM,false);
0112 end
0113 
0114 %***Begin formating the data structures
0115 
0116 %Expand the model so that iso-enzymes have different reactions
0117 model=expandModel(model);
0118 
0119 %Identify reactions that have to be deleted because the involved mets are
0120 %never produced. This is done in an iterative manner
0121 removedRxns={};
0122 %This is to keep track of which metabolites are removed in this step. It is
0123 %needed to adjust the transport costs
0124 originalModelMets=model.mets;
0125 while 1
0126     irrevModel=convertToIrrev(model);
0127     
0128     I=sum(irrevModel.S>0,2);
0129     
0130     %Pretend that the unconstrained metabolites are made enough
0131     if isfield(irrevModel,'unconstrained')
0132         I(irrevModel.unconstrained~=0)=2;
0133     end
0134     metsToDelete=false(numel(model.mets),1);
0135     
0136     %This is not very neat but iterate through each metabolite and check
0137     %whether it can be produced (without using only one isolated reversible
0138     %reaction)
0139     for i=1:numel(irrevModel.mets)
0140         %If something can be made in two reactions then everything is fine.
0141         %If it can be made in one reaction it is fine unless it is through
0142         %an isolated reversible reaction (which can act as a mini loop)
0143         if I(i)<2
0144             if I(i)==1
0145                 %Find the reaction where this metabolite is produced
0146                 [~, J]=find(irrevModel.S(i,:)>0);
0147                 
0148                 %Check the metabolites that are consumed in this reaction.
0149                 %The problem is if any of them is only produced in the
0150                 %opposite reversible reaction
0151                 K=irrevModel.S(:,J)<0;
0152                 check=find(K & I<=1);
0153                 
0154                 for j=1:numel(check)
0155                     %Find the reactions where it participates
0156                     [~, L]=find(irrevModel.S(check(j),:)>0);
0157                     
0158                     if ~isempty(L)
0159                         rxn=irrevModel.rxns(J);
0160                         rxnRev=irrevModel.rxns(L);
0161                         if strcmp(strrep(rxn,'_REV',''),strrep(rxnRev,'_REV',''))
0162                             metsToDelete(i)=true;
0163                         end
0164                     else
0165                         %If the metabolite was never produced then do
0166                         %nothing and deal with it when the loop gets there
0167                         continue;
0168                     end
0169                 end
0170             else
0171                 %Not made anywhere
0172                 metsToDelete(i)=true;
0173             end
0174         end
0175     end
0176     
0177     if any(metsToDelete)
0178         %Delete any reactions involving any of the metsToDelete
0179         [~, I]=find(model.S(metsToDelete,:));
0180         removedRxns=[removedRxns;model.rxns(I)];
0181         model=removeReactions(model,I,true,true);
0182     else
0183         %All bad reactions deleted
0184         break;
0185     end
0186 end
0187 
0188 %Adjust the transport costs
0189 transportCost=transportCost(ismember(originalModelMets,model.mets));
0190 
0191 %Assign fake genes to reactions without genes. This is just to make things
0192 %easier later on
0193 I=find(sum(model.rxnGeneMat,2)==0);
0194 for i=1:numel(I)
0195     model.genes=[model.genes;['&&FAKE&&' num2str(i)]];
0196     if isfield(model,'geneShortNames')
0197         model.geneShortNames=[model.geneShortNames;{''}];
0198     end
0199     if isfield(model,'geneMiriams')
0200         model.geneMiriams=[model.geneMiriams;{[]}];
0201     end
0202     if isfield(model,'proteins')
0203         model.proteins=[model.proteins;{[]}];
0204     end
0205     if isfield(model,'geneFrom')
0206         model.geneFrom=[model.geneFrom;{{'FAKE'}}];
0207     end
0208     model.rxnGeneMat(I(i),numel(model.genes))=1;
0209     model.grRules{I(i)}='';
0210 end
0211 
0212 %Update the GSS. All genes, fake or real, for which there is no evidence
0213 %gets a score 0.5 in all compartments. Also just to make it easier further
0214 %on
0215 I=setdiff(model.genes,GSS.genes);
0216 GSS.genes=[GSS.genes;I];
0217 GSS.scores=[GSS.scores;ones(numel(I),numel(GSS.compartments))*0.5];
0218 
0219 %Gene complexes should be moved together in order to be biologically
0220 %relevant. The average score for the genes is used for each compartment.
0221 %This is done by changing the model so that gene complexes are used as a
0222 %single gene name and then a score is calculated for that "gene".
0223 
0224 %Only "and"-relationships exist after expandModel
0225 genes=unique(model.grRules);
0226 nGenes=strrep(genes,'(','');
0227 nGenes=strrep(nGenes,')','');
0228 %nGenes=strrep(nGenes,' and ','_and_');
0229 complexes=setdiff(nGenes,model.genes);
0230 if ~isempty(complexes)
0231     if isempty(complexes{1}) %Empty grRules also come up here
0232         complexes(1)=[];
0233     end
0234 end
0235 cScores=zeros(numel(complexes),numel(GSS.compartments));
0236 for i=1:numel(complexes)
0237     genesInComplex=regexp(complexes{i},' and ','split');
0238     
0239     %Find these genes in GSS
0240     [I, J]=ismember(genesInComplex,GSS.genes);
0241     
0242     if any(I)
0243         %Get the average of the genes that were found
0244         mScores=mean(GSS.scores(J(I),:));
0245         
0246         %And add 0.5 for the genes that were not found in order to be
0247         %consistent with non-complexes
0248         mScores=(mScores.*sum(I)+(numel(genesInComplex)-sum(I))*0.5)/numel(genesInComplex);
0249     else
0250         EM=['Could not parse grRule "' complexes{i} '". Assigning score 0.0 in all compartments'];
0251         dispEM(EM,false);
0252         mScores=ones(1,numel(genesInComplex))*0.5;
0253     end
0254     cScores(i,:)=mScores;
0255     
0256     %Add this complex as a new gene
0257     model.genes=[model.genes;complexes{i}];
0258     if isfield(model,'geneMiriams')
0259         model.geneMiriams=[model.geneMiriams;{[]}];
0260     end
0261     if isfield(model,'geneShortNames')
0262         model.geneShortNames=[model.geneShortNames;{''}];
0263     end
0264     if isfield(model,'proteins')
0265         model.proteins=[model.proteins;{''}];
0266     end
0267     if isfield(model,'geneFrom')
0268         model.geneFrom=[model.geneFrom;{'COMPLEX'}];
0269     end
0270     %Find the reactions which had the original complex and change them to
0271     %use the new "gene"
0272     I=ismember(model.grRules,['(' complexes{i} ')']);
0273     
0274     %Should check more carefully if there can be an error here
0275     if ~isempty(I)
0276         model.rxnGeneMat(I,:)=0; %Ok since the split on "or" was applied
0277         model.rxnGeneMat(I,numel(model.genes))=1;
0278     end
0279 end
0280 
0281 %Add the new "genes"
0282 GSS.genes=[GSS.genes;complexes];
0283 GSS.scores=[GSS.scores;cScores];
0284 
0285 %After merging the complexes it could happen that there are genes that are
0286 %no longer in use. Delete such genes
0287 model=removeReactions(model,{},false,true);
0288 
0289 %Exchange reactions, defined as involving an unconstrained metabolite, are
0290 %special in that they have to stay in the defaultCompartment. This means
0291 %that uptake/excretion of metabolites is always via the default
0292 %compartment. This is a small simplification, but should be valid in most
0293 %cases
0294 [~, I]=getExchangeRxns(model);
0295 
0296 %It will be easier later on if the same place. Put them in the beginning
0297 J=1:numel(model.rxns);
0298 J(I)=[];
0299 model=permuteModel(model,[I;J'],'rxns');
0300 
0301 %Number of exchange reactions
0302 nER=numel(I);
0303 
0304 %Also put the exchange metabolites in the beginning
0305 if isfield(model,'unconstrained')
0306     I=find(model.unconstrained);
0307     J=1:numel(model.mets);
0308     J(I)=[];
0309     model=permuteModel(model,[I;J'],'mets');
0310     %Also reorder the transport costs
0311     transportCost=transportCost([I;J']);
0312     %Number of exchange metabolites
0313     nEM=numel(I);
0314 else
0315     nEM=0;
0316 end
0317 
0318 %There is no point of having genes for exchange reactions, so delete them.
0319 %Also to make computations easier
0320 model.rxnGeneMat(1:nER,:)=0;
0321 model.grRules(1:nER)={''};
0322 
0323 %Remove unused genes
0324 model=removeReactions(model,{},false,true);
0325 
0326 %Remove genes with no match to the model and reorder so that the genes are
0327 %in the same order as model.genes. Since the fake genes are already added
0328 %so that all genes in model exist in GSS it is fine to do like this
0329 [~, J]=ismember(model.genes,GSS.genes);
0330 GSS.genes=model.genes;
0331 GSS.scores=GSS.scores(J,:);
0332 
0333 %Reorder the GSS so that the first index corresponds to the default
0334 %compartment
0335 [~, J]=ismember(defaultCompartment,GSS.compartments);
0336 reorder=1:numel(GSS.compartments);
0337 reorder(J)=[];
0338 reorder=[J reorder];
0339 GSS.scores=GSS.scores(:,reorder);
0340 GSS.compartments=GSS.compartments(reorder);
0341 
0342 %Since it is only checked whether the metabolites can be synthesized, there
0343 %is no need to care about the stoichiometry. Change to -1/1 to simplify
0344 %later. Keep the S matrix for later though
0345 oldS=model.S;
0346 model.S(model.S>0)=1;
0347 model.S(model.S<0)=-1;
0348 
0349 %Here is a bit of a trick. To avoid the recurring calculation which
0350 %reactions are reversible, the reversible reactions have the coefficients
0351 %-10/10 instead of -1/1
0352 model.S(:,model.rev==1)=model.S(:,model.rev==1).*10;
0353 
0354 %***Begin problem formulation
0355 
0356 %Some numbers that are good to have
0357 nRxns=numel(model.rxns)-nER; %Excluding exchange rxns
0358 nMets=numel(model.mets)-nEM; %Excluding exchange mets
0359 nGenes=numel(model.genes);
0360 nComps=numel(GSS.compartments);
0361 
0362 %Create a big stoichiometric matrix that will be the current model. In
0363 %order to have faster simulations the maximal model size is declared and
0364 %reactions are then moved within it.
0365 
0366 %First the original model (with the first nE being exchange rxns), then
0367 %reserve space for number of rxns minus exchange rxns for each non-default
0368 %compartment, then transport reactions for all non-exchange mets between
0369 %the default compartment and all others.
0370 %NOTE: Kept eye()*0 since eye() can be used to include all transport from
0371 %the beginning
0372 s=repmat(eye(nMets)*0,1,nComps-1);
0373 s=[zeros(numel(model.mets)-nMets,size(s,2));s];
0374 S=[model.S sparse(numel(model.mets),nRxns*(nComps-1)) s];
0375 s=[sparse(nMets*(nComps-1),numel(model.rxns)+nRxns*(nComps-1)) eye(nMets*(nComps-1))*0];
0376 S=[S;s];
0377 
0378 %Also replicate the transport costs
0379 transportCost=[transportCost(1:nEM);repmat(transportCost(nEM+1:end),nComps,1)];
0380 
0381 %Create a binary matrix that says where the genes are in the current
0382 %solution
0383 g2c=false(nGenes,nComps);
0384 %All genes start in the default compartment
0385 g2c(:,1)=true;
0386 
0387 %Start of main optimization loop
0388 tic;
0389 bestScore=-inf;
0390 bestS=[];
0391 bestg2c=[];
0392 
0393 %Temp for testing
0394 plotScore=[];
0395 nTrans=[];
0396 totScore=[];
0397 minScore=sum(min(GSS.scores,[],2));
0398 maxScore=sum(max(GSS.scores,[],2));
0399 
0400 while toc<maxTime*60
0401     %Pick a random gene, weighted by it is current score minus the best
0402     %score for that gene (often 1.0, but can be 0.5 for no genes or average
0403     %for complexes). Genes with bad fits are more likely to be moved. This
0404     %formulation never moves a gene from its best compartment. Therefore a
0405     %small uniform weight is added
0406     [I, J]=find(g2c);
0407     geneToMove=randsample(nGenes,1,true,max(GSS.scores(I,:),[],2)-GSS.scores(sub2ind(size(g2c),I,J))+0.1);
0408     
0409     %Sample among possible compartments to move to. Add a larger weight to
0410     %even out the odds a little. Also a way of getting rid of loops where
0411     %the same set of genes are moved back and forth several times
0412     toComp=randsample(nComps,1,true,GSS.scores(geneToMove,:)+0.2);
0413     
0414     %Check that it moves to a new compartment
0415     if toComp==find(g2c(geneToMove,:))
0416         continue;
0417     end
0418     
0419     %Moves the gene
0420     [newS, newg2c]=moveGene(S,model,g2c,geneToMove,toComp,nRxns,nMets);
0421     
0422     %Tries to connect the network. If this was not possible in 10
0423     %iterations, then abort. If more than 20 modifications were needed then
0424     %it is unlikely that it will be a lower score
0425     wasConnected=false;
0426     for j=1:10
0427         %Find the metabolites that are now unconnected
0428         unconnected=findUnconnected(newS,nEM);
0429         
0430         %Continue if there are still unconnected
0431         if any(unconnected)
0432             %For each gene find out how many of these could be connected if
0433             %the gene was moved and how many would be disconnected by
0434             %moving that gene
0435             [geneIndex, moveTo, deltaConnected, deltaScore]=selectGenes(newS,nEM,nMets,nER,nRxns,model,unconnected,g2c,GSS);
0436             
0437             %Score which gene would be the best to move. The highest
0438             %deltaScore is 1.0. It should be possible to move a gene from
0439             %worst to best compartment even if it disconnects, say, 1.5
0440             %more metabolites
0441             [score, I]=max(1.5*deltaScore+deltaConnected);
0442             
0443             %Check if it has to add a transport or if there is a gene that
0444             %could be moved order to have a more connected network
0445             hasToAddTransport=true;
0446             if ~isempty(deltaConnected)
0447                 if score>0
0448                     hasToAddTransport=false;
0449                 end
0450             end
0451             
0452             %If it is possible to move any gene in order to have a more
0453             %connected network, then move the best one
0454             if hasToAddTransport==false
0455                 [newS, newg2c]=moveGene(newS,model,g2c,geneIndex(I),moveTo(I),nRxns,nMets);
0456             else
0457                 %Choose a random unconnected metabolite that should be
0458                 %connected
0459                 transMet=unconnected(randsample(numel(unconnected),1));
0460                 
0461                 %First get where the metabolite is now
0462                 comps=ceil((transMet-nEM)/((size(S,1)-nEM)/nComps));
0463                 
0464                 %Find the corresponding metabolite index if it were in the
0465                 %default compartment
0466                 dcIndex=transMet-(comps-1)*nMets;
0467                 
0468                 %Then get the indexes of that metabolite in all
0469                 %compartments
0470                 allIndexes=dcIndex;
0471                 for k=1:nComps-1
0472                     allIndexes=[allIndexes;dcIndex+nMets*k];
0473                 end
0474                 
0475                 %It could be that some of these are not used in any
0476                 %reaction. Get only the ones which are
0477                 I=sum(newS(allIndexes,:)~=0,2)>0;
0478                 
0479                 %Then get the ones that are used but not in unconnected.
0480                 %These are metabolites that could potentially be
0481                 %transported to connect transMet
0482                 connectedUsed=setdiff(allIndexes(I),unconnected);
0483                 
0484                 %This may be an error but leave it for now. It seems to
0485                 %happen if nothing can be connected in one step
0486                 if isempty(connectedUsed)
0487                     break;
0488                 end
0489                 
0490                 %If transMet is in the default compartment then everything
0491                 %is fine, just connect it to a random one
0492                 if transMet==dcIndex
0493                     newS=addTransport(newS,nRxns,nER,nMets,nEM,nComps,transMet,connectedUsed(randsample(numel(connectedUsed),1)));
0494                 else
0495                     %If one of the connectedUsed is in the default
0496                     %compartment then connect to that one
0497                     I=connectedUsed(connectedUsed<(nMets+nEM));
0498                     if any(I)
0499                         newS=addTransport(newS,nRxns,nER,nMets,nEM,nComps,transMet,I(randsample(numel(I),1)));
0500                     else
0501                         %This is if the only way to connect it is by adding
0502                         %two transport reactions, going via the default
0503                         %compartment
0504                         break;
0505                     end
0506                 end
0507             end
0508         else
0509             wasConnected=true;
0510             break;
0511         end
0512     end
0513     
0514     %If the network was connected in a new way, it is possible that some
0515     %transport reactions are no longer needed. They should be removed
0516     if wasConnected==true
0517         %These are the metabolites that are being transported
0518         activeTransport=find(sum(newS(:,nER+nRxns*nComps+1:end),2));
0519         
0520         %Get the metabolites that are unconnected if transport was not used
0521         unconnected=findUnconnected(newS(:,1:nER+nRxns*nComps),nEM);
0522         
0523         %Find the transport reactions that are not needed and delete them
0524         I=setdiff(activeTransport,unconnected);
0525         
0526         %Since both metabolites in a transport rxns must be connected for
0527         %the reaction to be deleted, the sum over the colums should be 4
0528         newS(:,find(sum(newS(I,nER+nRxns*nComps+1:end))==4)+nER+nRxns*nComps)=0;
0529         
0530         %Score the solution and determine whether to keep it as a new
0531         %solution
0532         [score, geneScore, trCost]=scoreModel(newS,newg2c,GSS,transportCost);
0533         
0534         %If it was the best solution so far, keep it
0535         if score>bestScore
0536             bestScore=score;
0537             bestS=newS;
0538             bestg2c=newg2c;
0539         end
0540         
0541         %This should not be steepest descent later
0542         if score>=bestScore% || exp((score-bestScore)*7)>rand()
0543             plotScore=[plotScore;geneScore];
0544             nTrans=[nTrans;trCost];
0545             totScore=[totScore;score];
0546             S=newS;
0547             g2c=newg2c;
0548             
0549             if plotResults==true
0550                 subplot(3,2,1);
0551                 spy(S);
0552                 subplot(3,2,2);
0553                 plot(plotScore,'r');
0554                 xlabel('Gene score');
0555                 subplot(3,2,3);
0556                 plot((plotScore-minScore)/(maxScore-minScore),'r');
0557                 xlabel('Gene score relative to predictions');
0558                 subplot(3,2,4);
0559                 plot(nTrans,'g');
0560                 xlabel('Transport cost');
0561                 subplot(3,2,5);
0562                 plot(totScore,'b');
0563                 xlabel('Total score');
0564                 subplot(3,2,6);
0565                 pause(0.2);
0566             end
0567         end
0568     end
0569 end
0570 scores.totScore=score;
0571 scores.geneScore=geneScore;
0572 scores.transCost=trCost;
0573 
0574 %Find which metabolites are transported and to where
0575 [I, J]=find(bestS(nEM+1:nEM+nMets,end-nMets*(nComps-1)+1:end));
0576 J=ceil(J/nMets+1);
0577 transportStruct.mets=model.metNames(I+nEM);
0578 transportStruct.toComp=GSS.compartments(J);
0579 
0580 [I, J]=find(bestg2c);
0581 geneLocalization.genes=GSS.genes(I);
0582 geneLocalization.comps=GSS.compartments(J);
0583 
0584 %Resort the gene names
0585 [~, I]=sort(geneLocalization.genes);
0586 geneLocalization.genes=geneLocalization.genes(I);
0587 geneLocalization.comps=geneLocalization.comps(I);
0588 
0589 %Remove the fake genes
0590 I=strncmp('&&FAKE&&',geneLocalization.genes,8);
0591 geneLocalization.genes(I)=[];
0592 geneLocalization.comps(I)=[];
0593 
0594 %Put together the model. This is done by first duplicating the S matrix
0595 %into the different compartments. Then the transport reactions are added
0596 %based on transportStruct. By now model.S should have the same size as the
0597 %S matrix used in the optimization, but with conserved stoichiometry. In
0598 %the final step all reactions and metabolites that are not used in the S
0599 %matrix from the optimization are deleted from the model
0600 outModel=model;
0601 outModel.S=oldS;
0602 
0603 %This is the S matrix without exchange rxns or metabolites
0604 copyPart=outModel.S(nEM+1:end,nER+1:end);
0605 
0606 %Replicate to give the rxnGeneMat for the full system
0607 copyRxnGeneMat=outModel.rxnGeneMat(nER+1:end,:);
0608 outModel.rxnGeneMat=[outModel.rxnGeneMat;repmat(copyRxnGeneMat,nComps-1,1)];
0609 
0610 %First fix the compartments. The model is already ordered with the exchange
0611 %metabolites first. The original model may contain one or two compartments,
0612 %depending on whether any exchange metabolites are defined
0613 nStartComps=numel(outModel.comps);
0614 if nStartComps==1
0615     outModel.comps={'1'};
0616     outModel.compNames=GSS.compartments(1);
0617 else
0618     if model.metComps(1)==1
0619         outModel.compNames(1)=GSS.compartments(1);
0620     else
0621         outModel.compNames(2)=GSS.compartments(1);
0622     end
0623 end
0624 outModel.compNames=[outModel.compNames;GSS.compartments(2:end)'];
0625 
0626 %Ugly little loop
0627 for i=1:numel(GSS.compartments)-1
0628     outModel.comps=[outModel.comps;num2str(numel(outModel.comps)+1)];
0629 end
0630 %This information is not known from the data, so empty fields are added
0631 outModel.compOutside=cell(numel(outModel.comps),1);
0632 outModel.compOutside(:)={''};
0633 
0634 for i=1:nComps-1
0635     outModel.S=[outModel.S sparse(size(outModel.S,1),nRxns)];
0636     outModel.S=[outModel.S;[sparse(nMets,nRxns*i+nER) copyPart]];
0637     outModel.rxns=[outModel.rxns;strcat(outModel.rxns(nER+1:nER+nRxns),'_',GSS.compartments{i+1})];
0638     outModel.rxnNames=[outModel.rxnNames;strcat(outModel.rxnNames(nER+1:nER+nRxns),' (',GSS.compartments{i+1},')')];
0639     outModel.lb=[outModel.lb;outModel.lb(nER+1:nER+nRxns)];
0640     outModel.ub=[outModel.ub;outModel.ub(nER+1:nER+nRxns)];
0641     outModel.rev=[outModel.rev;outModel.rev(nER+1:nER+nRxns)];
0642     outModel.c=[outModel.c;outModel.c(nER+1:nER+nRxns)];
0643     if isfield(outModel,'grRules')
0644         outModel.grRules=[outModel.grRules;outModel.grRules(nER+1:nER+nRxns)];
0645     end
0646     if isfield(outModel,'subSystems')
0647         outModel.subSystems=[outModel.subSystems;outModel.subSystems(nER+1:nER+nRxns)];
0648     end
0649     if isfield(outModel,'eccodes')
0650         outModel.eccodes=[outModel.eccodes;outModel.eccodes(nER+1:nER+nRxns)];
0651     end
0652     if isfield(outModel,'rxnFrom')
0653         outModel.rxnFrom=[outModel.rxnFrom;outModel.rxnFrom(nER+1:nER+nRxns)];
0654     end
0655     if isfield(outModel,'rxnMiriams')
0656         outModel.rxnMiriams=[outModel.rxnMiriams;outModel.rxnMiriams(nER+1:nER+nRxns)];
0657     end
0658     if isfield(outModel,'rxnNotes')
0659         outModel.rxnNotes=[outModel.rxnNotes;outModel.rxnNotes(nER+1:nER+nRxns)];
0660     end
0661     if isfield(outModel,'rxnReferences')
0662         outModel.rxnReferences=[outModel.rxnReferences;outModel.rxnReferences(nER+1:nER+nRxns)];
0663     end
0664     if isfield(outModel,'rxnConfidenceScores')
0665         outModel.rxnConfidenceScores=[outModel.rxnConfidenceScores;outModel.rxnConfidenceScores(nER+1:nER+nRxns)];
0666     end
0667     if isfield(outModel,'rxnDeltaG')
0668         outModel.rxnDeltaG=[outModel.rxnDeltaG;outModel.rxnDeltaG(nER+1:nER+nRxns)];
0669     end
0670     outModel.mets=[outModel.mets;strcat(outModel.mets(nEM+1:nEM+nMets),'_',GSS.compartments{i+1})];
0671     outModel.metNames=[outModel.metNames;outModel.metNames(nEM+1:nEM+nMets)];
0672     outModel.b=[outModel.b;outModel.b(nEM+1:nEM+nMets,:)];
0673     I=ones(nMets,1)*nStartComps+i;
0674     outModel.metComps=[outModel.metComps;I];
0675     if isfield(outModel,'inchis')
0676         outModel.inchis=[outModel.inchis;outModel.inchis(nEM+1:nEM+nMets)];
0677     end
0678     if isfield(outModel,'metSmiles')
0679         outModel.metSmiles=[outModel.metSmiles;outModel.metSmiles(nEM+1:nEM+nMets)];
0680     end
0681     if isfield(outModel,'unconstrained')
0682         outModel.unconstrained=[outModel.unconstrained;outModel.unconstrained(nEM+1:nEM+nMets)];
0683     end
0684     if isfield(outModel,'metMiriams')
0685         outModel.metMiriams=[outModel.metMiriams;outModel.metMiriams(nEM+1:nEM+nMets)];
0686     end
0687     if isfield(outModel,'metFormulas')
0688         outModel.metFormulas=[outModel.metFormulas;outModel.metFormulas(nEM+1:nEM+nMets)];
0689     end
0690     if isfield(outModel,'metFrom')
0691         outModel.metFrom=[outModel.metFrom;outModel.metFrom(nEM+1:nEM+nMets)];
0692     end
0693     if isfield(outModel,'metCharges')
0694         outModel.metCharges=[outModel.metCharges;outModel.metCharges(nEM+1:nEM+nMets)];
0695     end
0696     if isfield(outModel,'metDeltaG')
0697         outModel.metDeltaG=[outModel.metDeltaG;outModel.metDeltaG(nEM+1:nEM+nMets)];
0698     end
0699 end
0700 
0701 %Add the transport reactions
0702 transS=bestS(:,numel(outModel.rxns)+1:end);
0703 J=sum(transS)>0; %Active rxns
0704 
0705 %Transport reactions are written in a different way compared to a "real"
0706 %stoichimetric matrix. This is to fix that
0707 transS(transS~=0)=1;
0708 transS(1:nEM+nMets,:)=transS(1:nEM+nMets,:)*-1;
0709 I=find(sum(transS>0,2));
0710 nTransRxns=numel(I);
0711 outModel.S=[outModel.S transS(:,J)];
0712 filler=ones(nTransRxns,1);
0713 outModel.lb=[outModel.lb;filler*-1000];
0714 outModel.ub=[outModel.ub;filler*1000];
0715 outModel.rev=[outModel.rev;filler];
0716 outModel.c=[outModel.c;filler*0];
0717 outModel.rxnGeneMat=[outModel.rxnGeneMat;sparse(nTransRxns,numel(outModel.genes))];
0718 
0719 for i=1:numel(I)
0720     outModel.rxns=[outModel.rxns;strcat('transport',num2str(i))];
0721     outModel.rxnNames=[outModel.rxnNames;['Transport of ',outModel.metNames{I(i)}]];
0722     if isfield(outModel,'grRules')
0723         outModel.grRules=[outModel.grRules;{''}];
0724     end
0725     if isfield(outModel,'rxnMiriams')
0726         outModel.rxnMiriams=[outModel.rxnMiriams;{[]}];
0727     end
0728     if isfield(outModel,'subSystems')
0729         outModel.subSystems=[outModel.subSystems;{{'Inferred transport reactions'}}];
0730     end
0731     if isfield(outModel,'eccodes')
0732         outModel.eccodes=[outModel.eccodes;{''}];
0733     end
0734     if isfield(outModel,'rxnFrom')
0735         outModel.rxnFrom=[outModel.rxnFrom;{''}];
0736     end
0737     if isfield(outModel,'rxnNotes')
0738         outModel.rxnNotes=[outModel.rxnNotes;{''}];
0739     end
0740     if isfield(outModel,'rxnReferences')
0741         outModel.rxnReferences=[outModel.rxnReferences;{''}];
0742     end
0743     if isfield(outModel,'rxnConfidenceScores')
0744         outModel.rxnConfidenceScores=[outModel.rxnConfidenceScores;NaN];
0745     end
0746     if isfield(outModel,'rxnDeltaG')
0747         outModel.rxnDeltaG=[outModel.rxnDeltaG;NaN];
0748     end
0749 end
0750 
0751 %Then remove all reactions and metabolites that aren't used in the final
0752 %solution from the optimization
0753 [~, J]=find(bestS(:,1:nER+nComps*nRxns));
0754 K=true(numel(outModel.rxns),1);
0755 K(J)=false;
0756 K(end-nTransRxns+1:end)=false;
0757 outModel=removeReactions(outModel,K,true);
0758 
0759 %Remove all fake genes
0760 I=strncmp('&&FAKE&&',outModel.genes,8);
0761 outModel.genes(I)=[];
0762 if isfield(outModel,'geneMiriams')
0763     outModel.geneMiriams(I)=[];
0764 end
0765 if isfield(outModel,'geneShortNames')
0766     outModel.geneShortNames(I)=[];
0767 end
0768 if isfield(outModel,'proteins')
0769     outModel.proteins(I)=[];
0770 end
0771 outModel.rxnGeneMat(:,I)=[];
0772 
0773 %Fix grRules and reconstruct rxnGeneMat
0774 [grRules,rxnGeneMat] = standardizeGrRules(outModel,true);
0775 outModel.grRules = grRules;
0776 outModel.rxnGeneMat = rxnGeneMat;
0777 end
0778 
0779 %Moves a gene and all associated reactions from one compartment to another
0780 function [S, g2c]=moveGene(S,model,g2c,geneToMove,toComp,nRxns,nMets)
0781 %Find the current compartment and update to the new one
0782 currentComp=find(g2c(geneToMove,:));
0783 g2c(geneToMove,:)=false;
0784 g2c(geneToMove,toComp)=true;
0785 
0786 %Find the reactions in the original model that the gene controls
0787 [I, ~]=find(model.rxnGeneMat(:,geneToMove));
0788 
0789 %Calculate their current positions in the S matrix
0790 oldRxns=I+(currentComp-1)*nRxns;
0791 
0792 %And their new positions
0793 newRxns=I+(toComp-1)*nRxns;
0794 
0795 %The metabolite ids also have to be changed in order to match the new
0796 %compartment
0797 metChange=nMets*(toComp-currentComp);
0798 
0799 %Update the reactions
0800 [I, J, K]=find(S(:,oldRxns));
0801 I=I+metChange;
0802 
0803 %Move the reactions
0804 S(:,oldRxns)=0;
0805 S(sub2ind(size(S),I,newRxns(J)))=K;
0806 end
0807 
0808 %Finds which metabolites are unconnected, in the sense that they are never
0809 %a product or only a product in a reversible reaction where one reactant is
0810 %only a product in the opposite direction of that reaction. This function
0811 %ignores exchange metabolites. Returns a vector of metabolite indexes.
0812 %metsToCheck is an array of metabolite indexes to check for connectivity.
0813 %If not supplied then all metabolites are checked
0814 function unconnected=findUnconnected(S,nEM,metsToCheck)
0815 if nargin>2
0816     %Do this by deleting everything from the network that is not in
0817     %metsToCheck and that is not exchange metabolites
0818     I=false(size(S,1),1);
0819     I(1:nEM)=true;
0820     I(metsToCheck)=true;
0821     S=S(I,:);
0822 end
0823 
0824 em=false(size(S,1),1);
0825 em(1:nEM)=true;
0826 
0827 %Construct a matrix in which the reversible reactions are inverted
0828 I=sum(S>2,1) | sum(S>2,1);
0829 revS=S;
0830 revS(:,I)=revS(:,I)*-1;
0831 
0832 %First calculate the ones that are ok
0833 %Produced in 2 rxns, is exchange, is not used at all, is produced in
0834 %non-reversible, involved in more than 1 reversible reactions
0835 connected=sum(S>0,2)>1 | em | sum(S~=0,2)==0 | sum(S(:,~I)>0,2)>0 | sum(S(:,I)~=0,2)>1;
0836 
0837 %Then get the ones that are unconnected because they are never produced
0838 unconnected=sum(S>0 | revS>0,2)==0 & connected==false;
0839 
0840 %Then get the ones that are potentially unconnected
0841 maybeUnconnected=~connected & ~unconnected;
0842 %maybeUnconnected=find(maybeUnconnectedS);
0843 
0844 %The metabolites in maybeUnconnected are involved in one reversible
0845 %reaction and not produced in any other reaction. This means that the
0846 %reactions which have at least one met in maybeUnconnected as reactant and
0847 %one as product are unconnected. The metabolites in maybeUnconnected that
0848 %are present in those reactions are then dead ends
0849 deadRxns=any(S(maybeUnconnected,:)>0) & any(S(maybeUnconnected,:)<0);
0850 
0851 %Get the mets involved in any of those reactions
0852 problematic=any(S(:,deadRxns)~=0,2);
0853 
0854 %If any of these are in the maybeUnconnected list then the metabolite is
0855 %unconnected
0856 unconnected(problematic & maybeUnconnected)=true;
0857 
0858 %Map back to metsToCheck
0859 if nargin>2
0860     unconnected=metsToCheck(unconnected(nEM+1:end));
0861 else
0862     unconnected=find(unconnected);
0863 end
0864 end
0865 
0866 %Given a set of unconnected metabolites, this function tries to move each
0867 %gene that could connect any of them, calculates the number of newly
0868 %connected metabolites minus the number of newly disconnected metabolites.
0869 %As some metabolites are very connected, only 25 genes are checked. Genes
0870 %that have a low score in their current compartment are more likely to be
0871 %moved
0872 function [geneIndex, moveTo, deltaConnected, deltaScore]=selectGenes(S,nEM,nMets,nER,nRxns,model,unconnected,g2c,GSS)
0873 %If moveTo is 0 then the gene cannot connect any of the metabolites
0874 moveTo=zeros(numel(model.genes),1);
0875 deltaConnected=zeros(numel(model.genes),1);
0876 
0877 %First get where the metabolites are now
0878 nComps=size(g2c,2);
0879 comps=ceil((unconnected-nEM)/((size(S,1)-nEM)/nComps));
0880 
0881 %Find the corresponding metabolite indexes if they all were in the default
0882 %compartment
0883 dcIndexes=unique(unconnected-(comps-1)*nMets);
0884 
0885 %Then find them if they were in any other compartment
0886 allIndexes=dcIndexes;
0887 for i=1:nComps-1
0888     allIndexes=[allIndexes;dcIndexes+nMets*i];
0889 end
0890 
0891 %Also check which reversible reactions that could be used
0892 I=sum(S>2,1) | sum(S>2,1);
0893 revS=S;
0894 revS(:,I)=revS(:,I)*-1;
0895 
0896 %Find all reactions that could make any of the unconnected metabolites in
0897 %some other compartment
0898 newMets=setdiff(allIndexes,unconnected);
0899 [~, potential]=find(S(newMets,:)>0 | revS(newMets,:)>0);
0900 potential(potential<=nER | potential>nER+nRxns*nComps)=[]; %No exchange rxns or transport rxns
0901 
0902 %Map J to the real metabolic reactions in model
0903 rxnComps=ceil((potential-nER)/(nRxns));
0904 
0905 %Find the corresponding reaction indexes if they all were in the default
0906 %compartment
0907 dcRxnIndexes=potential-(rxnComps-1)*nRxns;
0908 
0909 %Get the genes for those reactions
0910 genes=find(sum(model.rxnGeneMat(dcRxnIndexes,:)>0,1));
0911 
0912 %For some cases there can be very many reactions to connect something. This
0913 %is in particular true in the beginning of the optimization if, say, ATP is
0914 %unconnected. Therefore limit the number of genes to be checked to 25.
0915 %Weigh so that genes with bad scores in their current compartment are more
0916 %likely to be moved.
0917 
0918 %Get scores for these genes
0919 [~, J]=find(g2c(genes,:));
0920 
0921 %Add a small weight so that genes in their best compartment could be moved
0922 %as well
0923 geneScores=GSS.scores(sub2ind(size(g2c),genes(:),J));
0924 modGeneScores=1.1-geneScores;
0925 if numel(genes)>25
0926     rGenes=genes(randsample(numel(genes),min(numel(genes),25),true,modGeneScores));
0927     
0928     %The sampling with weights could give duplicates
0929     rGenes=unique(rGenes);
0930     
0931     %Reorder the geneScores to match
0932     [~, I]=ismember(rGenes,genes);
0933     geneScores=geneScores(I);
0934     genes=rGenes;
0935 end
0936 for i=1:numel(genes)
0937     %Since one gene is moved at a time, only metabolites involved in any of
0938     %the reactions for that gene can become unconnected. This helps to
0939     %speed up the algorithm. First get all involved reactions in the
0940     %default compartment
0941     rxns=find(model.rxnGeneMat(:,genes(i)));
0942     
0943     %Then get their mets
0944     mets=find(sum(model.S(:,rxns)~=0,2)>0);
0945     
0946     %Then get their indexes in all compartments
0947     allIndexes=mets;
0948     for j=1:nComps-1
0949         allIndexes=[allIndexes;mets+nMets*j];
0950     end
0951     
0952     %Check which of the unconnected metabolites that these reactions
0953     %correspond to. This could have been done earlier, but it is fast. The
0954     %reversibility check is skipped because it is unlikely to be an issue
0955     %here. Worst case is that the gene is tested once to much
0956     [I, ~]=find(model.S(:,rxns));
0957     moveToComps=unique(comps(ismember(dcIndexes,I)));
0958     
0959     %Try to move the gene to each of the compartments
0960     bestMove=-inf;
0961     bestComp=[];
0962     for j=1:numel(moveToComps)
0963         newS=moveGene(S,model,g2c,genes(i),moveToComps(j),nRxns,nMets);
0964         
0965         %Check how many metabolites that are unconnected after moving the
0966         %gene
0967         dConnected=numel(unconnected)-numel(findUnconnected(newS,nEM,[allIndexes;unconnected]));
0968         if dConnected>bestMove
0969             bestMove=dConnected;
0970             bestComp=moveToComps(j);
0971         end
0972     end
0973     
0974     %Add the difference in connectivity and where the genes should be moved
0975     moveTo(genes(i))=bestComp;
0976     deltaConnected(genes(i))=bestMove;
0977 end
0978 
0979 %Finish up
0980 geneIndex=genes(:);
0981 moveTo=moveTo(geneIndex);
0982 deltaConnected=deltaConnected(geneIndex);
0983 deltaScore=GSS.scores(sub2ind(size(g2c),geneIndex(:),moveTo))-geneScores;
0984 end
0985 
0986 %Small function to add a transport reactions between two metabolites.
0987 %Transport reactions are written as having a coefficient 2.0 for both
0988 %reactant and product. This is not a "real" reaction, but since all normal
0989 %reactions have coefficient -1/1 or -10/10 it is a compact way of writing
0990 %it
0991 function S=addTransport(S,nRxns,nER,nMets,nEM,nComps,metA,metB)
0992 mets=[metA;metB];
0993 %Find the current compartments for the metabolites
0994 comps=ceil((mets-nEM)/((size(S,1)-nEM)/nComps));
0995 
0996 if sum(comps==1)~=1
0997     EM='Tried to create a transport reaction from a non-default compartment';
0998     dispEM(EM);
0999 end
1000 
1001 %Calculate the reaction index
1002 rIndex=(nER+nRxns*nComps)+mets(comps~=1)-nEM-nMets;
1003 
1004 S(mets,rIndex)=2;
1005 end
1006 
1007 %Scores a network based on the localization of the genes and the number of
1008 %transporter reactions used
1009 function [score, geneScore, transportCost]=scoreModel(S,g2c,GSS,transportCost)
1010 [I, J]=find(g2c);
1011 geneScore=sum(GSS.scores(sub2ind(size(g2c),I,J)));
1012 [I, ~]=find(S==2);
1013 I=unique(I);
1014 transportCost=sum(transportCost(I));
1015 score=geneScore-transportCost;
1016 end
1017 
1018 % To avoid dependency on stats toolbox, use this alternative implementation
1019 % of randsample, source:
1020 % https://github.com/gpeyre/numerical-tours/blob/dacee30081c04ef5f67b26b387ead85f2b193af9/matlab/toolbox_signal/randsample.m
1021 function y = randsample(n, k, replace, w)
1022 %RANDSAMPLE Random sample, with or without replacement.
1023 %   Y = RANDSAMPLE(N,K) returns Y as a vector of K values sampled uniformly
1024 %   at random, without replacement, from the integers 1:N.
1025 %
1026 %   Y = RANDSAMPLE(POPULATION,K) returns K values sampled uniformly at
1027 %   random, without replacement, from the values in the vector POPULATION.
1028 %
1029 %   Y = RANDSAMPLE(...,REPLACE) returns a sample taken with replacement if
1030 %   REPLACE is true, or without replacement if REPLACE is false (the default).
1031 %
1032 %   Y = RANDSAMPLE(...,true,W) returns a weighted sample, using positive
1033 %   weights W, taken with replacement.  W is often a vector of probabilities.
1034 %   This function does not support weighted sampling without replacement.
1035 %
1036 %   Example:  Generate a random sequence of the characters ACGT, with
1037 %   replacement, according to specified probabilities.
1038 %
1039 %      R = randsample('ACGT',48,true,[0.15 0.35 0.35 0.15])
1040 %
1041 %   See also RAND, RANDPERM.
1042 
1043 %   Copyright 1993-2008 The MathWorks, Inc.
1044 %   $Revision: 1.1.4.3 $  $Date: 2008/12/01 08:09:34 $
1045 
1046 if nargin < 2
1047     error('stats:randsample:TooFewInputs','Requires two input arguments.');
1048 elseif numel(n) == 1
1049     population = [];
1050 else
1051     population = n;
1052     n = numel(population);
1053     if length(population)~=n
1054        error('stats:randsample:BadPopulation','POPULATION must be a vector.');
1055     end
1056 end
1057 
1058 if nargin < 3
1059     replace = false;
1060 end
1061 
1062 if nargin < 4
1063     w = [];
1064 elseif ~isempty(w)
1065     if length(w) ~= n
1066         if isempty(population)
1067             error('stats:randsample:InputSizeMismatch',...
1068                   'W must have length equal to N.');
1069         else
1070             error('stats:randsample:InputSizeMismatch',...
1071                   'W must have the same length as the population.');
1072         end
1073     else
1074         p = w(:)' / sum(w);
1075     end
1076 end
1077 
1078 switch replace
1079 
1080 % Sample with replacement
1081 case {true, 'true', 1}
1082     if isempty(w)
1083         y = ceil(n .* rand(k,1));
1084     else
1085         [dum, y] = histc(rand(k,1),[0 cumsum(p)]);
1086     end
1087 
1088 % Sample without replacement
1089 case {false, 'false', 0}
1090     if k > n
1091         if isempty(population)
1092             error('stats:randsample:SampleTooLarge',...
1093         'K must be less than or equal to N for sampling without replacement.');
1094         else
1095             error('stats:randsample:SampleTooLarge',...
1096                   'K must be less than or equal to the population size.');
1097         end
1098     end
1099 
1100     if isempty(w)
1101         % If the sample is a sizeable fraction of the population,
1102         % just randomize the whole population (which involves a full
1103         % sort of n random values), and take the first k.
1104         if 4*k > n
1105             rp = randperm(n);
1106             y = rp(1:k);
1107 
1108         % If the sample is a small fraction of the population, a full sort
1109         % is wasteful.  Repeatedly sample with replacement until there are
1110         % k unique values.
1111         else
1112             x = zeros(1,n); % flags
1113             sumx = 0;
1114             while sumx < k
1115                 x(ceil(n * rand(1,k-sumx))) = 1; % sample w/replacement
1116                 sumx = sum(x); % count how many unique elements so far
1117             end
1118             y = find(x > 0);
1119             y = y(randperm(k));
1120         end
1121     else
1122         error('stats:randsample:NoWeighting',...
1123               'Weighted sampling without replacement is not supported.');
1124     end
1125 otherwise
1126     error('stats:randsample:BadReplaceValue',...
1127           'REPLACE must be either true or false.');
1128 end
1129 
1130 if ~isempty(population)
1131     y = population(y);
1132 else
1133     y = y(:);
1134 end
1135 end
predictLocalization

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE