Home > external > getDiamond.m

getDiamond

PURPOSE ^

getDiamond

SYNOPSIS ^

function [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,modelIDs,refFastaFiles,developMode,hideVerbose)

DESCRIPTION ^

 getDiamond
   Uses DIAMOND to perform a bidirectional BLAST between the organism
   of interest and a set of template organisms

   Input:
   organismID      the id of the organism of interest. This should also
                   match with the id supplied to getModelFromHomology
   fastaFile       a FASTA file with the protein sequences for the
                   organism of interest
   modelIDs        a cell array of model ids. These must match the
                   "model.id" fields in the "models" structure if the
                   output is to be used with getModelFromHomology
   refFastaFiles   a cell array with the paths to the corresponding FASTA
                   files
   developMode     true if blastReport should be generated that is used
                   in the unit testing function for DIAMOND (optional, default
                   false)
   hideVerbose     true if no status messages should be printed (optional,
                   default false)

   Output:
   blastStructure  structure containing the bidirectional homology
                   measurements which are used by getModelFromHomology
   diamondReport   structure containing MD5 hashes for FASTA database
                   files and non-parsed BLAST output data. Will be blank
                   if developMode is false.

   NOTE: This function calls DIAMOND to perform a bidirectional homology
   search between the organism of interest and a set of other organisms
   using the '--more-sensitive' setting from DIAMOND. For the most
   sensitive results, the use of getBlast() is adviced, however,
   getDiamond() is a fast alternative (>15x faster). The blastStructure
   generated is in the same format as those obtained from getBlast().

 Usage: [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,...
    modelIDs,refFastaFiles,developMode,hideVerbose)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,...
0002     modelIDs,refFastaFiles,developMode,hideVerbose)
0003 % getDiamond
0004 %   Uses DIAMOND to perform a bidirectional BLAST between the organism
0005 %   of interest and a set of template organisms
0006 %
0007 %   Input:
0008 %   organismID      the id of the organism of interest. This should also
0009 %                   match with the id supplied to getModelFromHomology
0010 %   fastaFile       a FASTA file with the protein sequences for the
0011 %                   organism of interest
0012 %   modelIDs        a cell array of model ids. These must match the
0013 %                   "model.id" fields in the "models" structure if the
0014 %                   output is to be used with getModelFromHomology
0015 %   refFastaFiles   a cell array with the paths to the corresponding FASTA
0016 %                   files
0017 %   developMode     true if blastReport should be generated that is used
0018 %                   in the unit testing function for DIAMOND (optional, default
0019 %                   false)
0020 %   hideVerbose     true if no status messages should be printed (optional,
0021 %                   default false)
0022 %
0023 %   Output:
0024 %   blastStructure  structure containing the bidirectional homology
0025 %                   measurements which are used by getModelFromHomology
0026 %   diamondReport   structure containing MD5 hashes for FASTA database
0027 %                   files and non-parsed BLAST output data. Will be blank
0028 %                   if developMode is false.
0029 %
0030 %   NOTE: This function calls DIAMOND to perform a bidirectional homology
0031 %   search between the organism of interest and a set of other organisms
0032 %   using the '--more-sensitive' setting from DIAMOND. For the most
0033 %   sensitive results, the use of getBlast() is adviced, however,
0034 %   getDiamond() is a fast alternative (>15x faster). The blastStructure
0035 %   generated is in the same format as those obtained from getBlast().
0036 %
0037 % Usage: [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,...
0038 %    modelIDs,refFastaFiles,developMode,hideVerbose)
0039 
0040 if nargin<5
0041     developMode = false;
0042 end
0043 if nargin<6
0044     hideVerbose = false;
0045 end
0046 
0047 %Everything should be cell arrays
0048 organismID=convertCharArray(organismID);
0049 fastaFile=convertCharArray(fastaFile);
0050 modelIDs=convertCharArray(modelIDs);
0051 refFastaFiles=convertCharArray(refFastaFiles);
0052 
0053 %Create blank structures for results
0054 blastStructure=[];
0055 diamondReport.dbHashes={};
0056 diamondReport.diamondTxtOutput={};
0057 
0058 %Get the directory for RAVEN Toolbox.
0059 ravenPath=findRAVENroot();
0060 
0061 %Generate temporary names for DIAMOND databases and output files
0062 tmpDB=tempname;
0063 outFile=tempname;
0064 
0065 %Check for existence of files. If no full path is specified for a file,
0066 %assume that it is in the current folder
0067 if isrow(refFastaFiles)
0068     files=horzcat(fastaFile,refFastaFiles);
0069 else
0070     files=vertcat(fastaFile,refFastaFiles);
0071 end
0072 
0073 files=checkFileExistence(files,2); %Copy files to temp dir
0074 fastaFile = files(1);
0075 refFastaFiles = files(2:end);
0076 
0077 %Identify the operating system
0078 if isunix
0079     if ismac
0080         binEnd='.mac';
0081     else
0082         binEnd='';
0083     end
0084 elseif ispc
0085     binEnd='.exe';
0086 else
0087     dispEM('Unknown OS, exiting.')
0088     return
0089 end
0090 
0091 %Run DIAMOND multi-threaded to use all logical cores assigned to MATLAB.
0092 cores = evalc('feature(''numcores'')');
0093 cores = strsplit(cores, 'MATLAB was assigned: ');
0094 cores = regexp(cores{2},'^\d*','match');
0095 cores = cores{1};
0096 
0097 %Create a database for the new organism and blast each of the refFastaFiles
0098 %against it
0099 [status, message]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" makedb --in "' fastaFile{1} '" --db "' fullfile(tmpDB) '"']);
0100 if developMode
0101     diamondReport.dbHashes{numel(diamondReport.dbHashes)+1} = char(regexp(message,'[a-f0-9]{32}','match'));
0102 end
0103 if status~=0
0104     error('DIAMOND makedb did not run successfully, error:\n%s',strip(message))
0105 end
0106 
0107 for i=1:numel(refFastaFiles)
0108     if ~hideVerbose
0109         fprintf(['Running DIAMOND blastp with "' modelIDs{i} '" against "' organismID{1} '"..\n']);
0110     end
0111     [status, message]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" blastp --query "' refFastaFiles{i} '" --out "' outFile '_' num2str(i) '" --db "' fullfile(tmpDB) '" --more-sensitive --outfmt 6 qseqid sseqid evalue pident length bitscore ppos --threads ' cores ]);
0112     if developMode
0113         diamondReport.diamondTxtOutput{numel(diamondReport.diamondTxtOutput)+1}=importdata([outFile '_' num2str(i)]);
0114     end
0115     if status~=0
0116         error('DIAMOND blastp did not run successfully, error:\n%s',strip(message))
0117     end
0118 end
0119 delete([tmpDB filesep 'tmpDB*']);
0120 
0121 %Then create a database for each of the reference organisms and blast the
0122 %new organism against them
0123 for i=1:numel(refFastaFiles)
0124     if ~hideVerbose
0125         fprintf(['Running DIAMOND blastp with "' organismID{1} '" against "' modelIDs{i} '"..\n']);
0126     end
0127     [status, message1]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" makedb --in "' refFastaFiles{i} '" --db "' fullfile(tmpDB) '"']);
0128     if status~=0
0129         error('DIAMOND makedb did not run successfully, error:\n%s',strip(message1))
0130     end
0131     [status, message]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" blastp --query "' fastaFile{1} '" --out "' outFile '_r' num2str(i) '" --db "' fullfile(tmpDB) '" --more-sensitive --outfmt 6 qseqid sseqid evalue pident length bitscore ppos --threads ' cores]);
0132     if developMode
0133         diamondReport.dbHashes{numel(diamondReport.dbHashes)+1} = char(regexp(message1,'[a-f0-9]{32}','match'));
0134         diamondReport.diamondTxtOutput{numel(diamondReport.diamondTxtOutput)+1}=importdata([outFile '_r' num2str(i)]);
0135     end
0136     if status~=0
0137         error('DIAMOND blastp did not run successfully, error:\n%s',strip(message))
0138     end
0139     delete([tmpDB filesep 'tmpDB*']);
0140 end
0141 
0142 %Done with the DIAMOND blastp, do the parsing of the text files
0143 for i=1:numel(refFastaFiles)*2
0144     tempStruct=[];
0145     if i<=numel(refFastaFiles)
0146         tempStruct.fromId=modelIDs{i};
0147         tempStruct.toId=organismID{1};
0148         A=readtable([outFile '_' num2str(i)],'Delimiter','\t','Format','%s%s%f%f%f%f%f');
0149     else
0150         tempStruct.fromId=organismID{1};
0151         tempStruct.toId=modelIDs{i-numel(refFastaFiles)};
0152         A=readtable([outFile '_r' num2str(i-numel(refFastaFiles))],'Delimiter','\t','Format','%s%s%f%f%f%f%f');
0153     end
0154     tempStruct.fromGenes=A{:,1};
0155     tempStruct.toGenes=A{:,2};
0156     tempStruct.evalue=table2array(A(:,3));
0157     tempStruct.identity=table2array(A(:,4));
0158     tempStruct.aligLen=table2array(A(:,5));
0159     tempStruct.bitscore=table2array(A(:,6));
0160     tempStruct.ppos=table2array(A(:,7));
0161     blastStructure=[blastStructure tempStruct];
0162 end
0163 
0164 %Remove the old tempfiles
0165 delete([outFile '*']);
0166 %Remove the temp fasta files
0167 delete(files{:})
0168 end

Generated by m2html © 2005