Home > external > kegg > getPhylDist.m

getPhylDist

PURPOSE ^

getPhylDist

SYNOPSIS ^

function phylDistStruct=getPhylDist(keggPath,onlyInKingdom)

DESCRIPTION ^

 getPhylDist
   Calculates distance between species in KEGG based on systematic name

   Input:
   keggPath        if keggPhylDist.mat is not in the RAVEN\external\kegg
                   directory, this function will attempt to read data from
                   a local FTP dump of the KEGG database. keggPath is the
                   path to the root of this database
   onlyInKingdom   if true, it generates a distance matrix with distance
                   Inf for organisms from another domains (Prokaryota,
                   Eukaryota) (optional, default false)

   Output:
   phylDistStruct  a structure with a list of organism ids and a matrix
                   that specifies their pairwise distances

   NOTE: This simple metric is based on the number of nodes two organisms
   are away from each other in KEGG

 Usage: phylDistStruct=getPhylDist(keggPath,onlyInKingdom)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function phylDistStruct=getPhylDist(keggPath,onlyInKingdom)
0002 % getPhylDist
0003 %   Calculates distance between species in KEGG based on systematic name
0004 %
0005 %   Input:
0006 %   keggPath        if keggPhylDist.mat is not in the RAVEN\external\kegg
0007 %                   directory, this function will attempt to read data from
0008 %                   a local FTP dump of the KEGG database. keggPath is the
0009 %                   path to the root of this database
0010 %   onlyInKingdom   if true, it generates a distance matrix with distance
0011 %                   Inf for organisms from another domains (Prokaryota,
0012 %                   Eukaryota) (optional, default false)
0013 %
0014 %   Output:
0015 %   phylDistStruct  a structure with a list of organism ids and a matrix
0016 %                   that specifies their pairwise distances
0017 %
0018 %   NOTE: This simple metric is based on the number of nodes two organisms
0019 %   are away from each other in KEGG
0020 %
0021 % Usage: phylDistStruct=getPhylDist(keggPath,onlyInKingdom)
0022 
0023 if nargin<1
0024     keggPath='RAVEN/external/kegg';
0025 else
0026     keggPath=char(keggPath);
0027 end
0028 if nargin<2
0029     onlyInKingdom=false;
0030 end
0031 
0032 %Check if the reactions have been parsed before and saved. If so, load the
0033 %model
0034 ravenPath=findRAVENroot();
0035 distFile=fullfile(ravenPath,'external','kegg','keggPhylDist.mat');
0036 if exist(distFile, 'file')
0037     fprintf(['Importing the KEGG phylogenetic distance matrix from ' strrep(distFile,'\','/') '... ']);
0038     load(distFile);
0039     fprintf('COMPLETE\n');
0040 else
0041     fprintf(['Cannot locate ' strrep(distFile,'\','/') '\n']);
0042     if ~isfile(fullfile(keggPath,'taxonomy'))
0043         EM=fprintf(['The file ''taxonomy'' cannot be located at ' strrep(keggPath,'\','/') '/ and should be downloaded from the KEGG FTP.\n']);
0044         dispEM(EM);
0045     else
0046         fprintf('Generating keggPhylDist.mat file... ');
0047         %Open the file that describes the naming of the species
0048         fid = fopen(fullfile(keggPath,'taxonomy'), 'r');
0049         
0050         phylDistStruct.ids={};
0051         phylDistStruct.names={};
0052         
0053         %Keeps the categories for each organism
0054         orgCat={};
0055         
0056         currentCat={};
0057         %Keeps track of the current category
0058         
0059         depth=0;
0060         %Keeps track of the current tree depth
0061         
0062         %Loop through the file
0063         orgCounter=0;
0064         while 1
0065             %Get the next line
0066             tline = fgetl(fid);
0067             
0068             %Abort at end of file
0069             if ~ischar(tline)
0070                 break;
0071             end
0072             
0073             if any(tline)
0074                 %Check if it's a new category
0075                 if tline(1)=='#'
0076                     %Find the first space (=depth +1)
0077                     sPos=strfind(tline,' ')-1;
0078                     %Should always exist
0079                     
0080                     sPos=sPos(1);
0081                     
0082                     %If we have stepped back one step in the tree
0083                     if sPos<depth
0084                         currentCat=currentCat(1:sPos);
0085                     end
0086                     depth=sPos;
0087                     
0088                     currentCat{depth}=tline(sPos+2:end);
0089                 else
0090                     orgCounter=orgCounter+1;
0091                     %It is an organism Get the id between first and second
0092                     %white space
0093                     sPos=find(isstrprop(tline, 'wspace'));
0094                     %Should always exist
0095                     
0096                     phylDistStruct.ids{orgCounter}=tline(sPos(1)+1:sPos(2)-1);
0097                     phylDistStruct.names{orgCounter}=tline(sPos(3)+1:end);
0098                     orgCat{orgCounter}=currentCat;
0099                 end
0100             end
0101         end
0102         %Generate a distance matrix (very straight forward here, not neat)
0103         phylDistStruct.distMat=zeros(numel(phylDistStruct.ids));
0104         phylDistStructOnlyInKingdom.distMat=zeros(numel(phylDistStruct.ids));
0105         phylDistStructOnlyInKingdom.ids=phylDistStruct.ids;
0106         for i=1:numel(phylDistStruct.ids)
0107             for j=1:numel(phylDistStruct.ids)
0108                 if ~strcmp(orgCat{i}(1),orgCat{j}(1))
0109                     phylDistStructOnlyInKingdom.distMat(i,j)=Inf;
0110                 end
0111                 %Calculate the distance between then
0112                 dist=numel(orgCat{i})-numel(orgCat{j});
0113                 if dist>0
0114                     aCat=orgCat{i}(1:end-dist);
0115                 else
0116                     aCat=orgCat{i};
0117                 end
0118                 if dist<0
0119                     bCat=orgCat{j}(1:end+dist);
0120                 else
0121                     bCat=orgCat{j};
0122                 end
0123                 
0124                 %Loop through the categories and stop when they are the
0125                 %same
0126                 for k=numel(aCat):-1:1
0127                     if strcmp(aCat{k},bCat{k})
0128                         break;
0129                     end
0130                 end
0131                 phylDistStruct.distMat(i,j)=dist+numel(aCat)-k;
0132             end
0133         end
0134         %Save the structure
0135         save(distFile,'phylDistStruct','phylDistStructOnlyInKingdom');
0136         fprintf('COMPLETE\n');
0137     end
0138 end
0139 if onlyInKingdom==true
0140     phylDistStruct=phylDistStructOnlyInKingdom;
0141 end
0142 end

Generated by m2html © 2005