Home > external > getDiamond.m

getDiamond

PURPOSE ^

getDiamond

SYNOPSIS ^

function [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,modelIDs,refFastaFiles,developMode,hideVerbose)

DESCRIPTION ^

 getDiamond
   Uses DIAMOND to perform a bidirectional BLAST between the organism
   of interest and a set of template organisms

   Input:
   organismID      the id of the organism of interest. This should also
                   match with the id supplied to getModelFromHomology
   fastaFile       a FASTA file with the protein sequences for the
                   organism of interest
   modelIDs        a cell array of model ids. These must match the
                   "model.id" fields in the "models" structure if the
                   output is to be used with getModelFromHomology
   refFastaFiles   a cell array with the paths to the corresponding FASTA
                   files
   developMode     true if blastReport should be generated that is used
                   in the unit testing function for DIAMOND (opt, default
                   false)
   hideVerbose     true if no status messages should be printed (opt,
                   default false)

   Output:
   blastStructure  structure containing the bidirectional homology
                   measurements which are used by getModelFromHomology
   diamondReport   structure containing MD5 hashes for FASTA database
                   files and non-parsed BLAST output data. Will be blank
                   if developMode is false.

   NOTE: This function calls DIAMOND to perform a bidirectional homology
   search between the organism of interest and a set of other organisms
   using the '--more-sensitive' setting from DIAMOND. For the most
   sensitive results, the use of getBlast() is adviced, however,
   getDiamond() is a fast alternative (>15x faster). The blastStructure
   generated is in the same format as those obtained from getBlast().

   Usage: [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,...
    modelIDs,refFastaFiles,developMode,hideVerbose)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,...
0002     modelIDs,refFastaFiles,developMode,hideVerbose)
0003 % getDiamond
0004 %   Uses DIAMOND to perform a bidirectional BLAST between the organism
0005 %   of interest and a set of template organisms
0006 %
0007 %   Input:
0008 %   organismID      the id of the organism of interest. This should also
0009 %                   match with the id supplied to getModelFromHomology
0010 %   fastaFile       a FASTA file with the protein sequences for the
0011 %                   organism of interest
0012 %   modelIDs        a cell array of model ids. These must match the
0013 %                   "model.id" fields in the "models" structure if the
0014 %                   output is to be used with getModelFromHomology
0015 %   refFastaFiles   a cell array with the paths to the corresponding FASTA
0016 %                   files
0017 %   developMode     true if blastReport should be generated that is used
0018 %                   in the unit testing function for DIAMOND (opt, default
0019 %                   false)
0020 %   hideVerbose     true if no status messages should be printed (opt,
0021 %                   default false)
0022 %
0023 %   Output:
0024 %   blastStructure  structure containing the bidirectional homology
0025 %                   measurements which are used by getModelFromHomology
0026 %   diamondReport   structure containing MD5 hashes for FASTA database
0027 %                   files and non-parsed BLAST output data. Will be blank
0028 %                   if developMode is false.
0029 %
0030 %   NOTE: This function calls DIAMOND to perform a bidirectional homology
0031 %   search between the organism of interest and a set of other organisms
0032 %   using the '--more-sensitive' setting from DIAMOND. For the most
0033 %   sensitive results, the use of getBlast() is adviced, however,
0034 %   getDiamond() is a fast alternative (>15x faster). The blastStructure
0035 %   generated is in the same format as those obtained from getBlast().
0036 %
0037 %   Usage: [blastStructure,diamondReport]=getDiamond(organismID,fastaFile,...
0038 %    modelIDs,refFastaFiles,developMode,hideVerbose)
0039 
0040 if nargin<5
0041     developMode = false;
0042 end
0043 if nargin<6
0044     hideVerbose = false;
0045 end
0046 
0047 %Everything should be cell arrays
0048 organismID=convertCharArray(organismID);
0049 fastaFile=convertCharArray(fastaFile);
0050 modelIDs=convertCharArray(modelIDs);
0051 refFastaFiles=convertCharArray(refFastaFiles);
0052 
0053 %Create blank structures for results
0054 blastStructure=[];
0055 diamondReport.dbHashes={};
0056 diamondReport.diamondTxtOutput={};
0057 
0058 %Get the directory for RAVEN Toolbox.
0059 ravenPath=findRAVENroot();
0060 
0061 %Generate temporary names for DIAMOND databases and output files
0062 tmpDB=tempname;
0063 outFile=tempname;
0064 
0065 %Check for existence of files. If no full path is specified for a file,
0066 %assume that it is in the current folder
0067 if isrow(refFastaFiles)
0068     files=horzcat(fastaFile,refFastaFiles);
0069 else
0070     files=vertcat(fastaFile,refFastaFiles);
0071 end
0072 
0073 files=checkFileExistence(files,2); %Copy files to temp dir
0074 fastaFile = files(1);
0075 refFastaFiles = files(2:end);
0076 
0077 %Identify the operating system
0078 if isunix
0079     if ismac
0080         binEnd='.mac';
0081     else
0082         binEnd='';
0083     end
0084 elseif ispc
0085     binEnd='.exe';
0086 else
0087     dispEM('Unknown OS, exiting.')
0088     return
0089 end
0090 
0091 %Run DIAMOND multi-threaded to use all logical cores assigned to MATLAB.
0092 cores = evalc('feature(''numcores'')');
0093 cores = strsplit(cores, 'MATLAB was assigned: ');
0094 cores = regexp(cores{2},'^\d*','match');
0095 cores = cores{1};
0096 
0097 %Create a database for the new organism and blast each of the refFastaFiles
0098 %against it
0099 [status, message]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" makedb --in "' fastaFile{1} '" --db "' fullfile(tmpDB) '"']);
0100 if developMode
0101     diamondReport.dbHashes{numel(diamondReport.dbHashes)+1} = char(regexp(message,'[a-f0-9]{32}','match'));
0102 end
0103 if status~=0
0104     EM=['DIAMOND makedb did not run successfully, error: ', num2str(status)];
0105     dispEM(EM,true);
0106 end
0107 
0108 for i=1:numel(refFastaFiles)
0109     if ~hideVerbose
0110         fprintf(['Running DIAMOND blastp with "' modelIDs{i} '" against "' organismID{1} '"..\n']);
0111     end
0112     [status, ~]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" blastp --query "' refFastaFiles{i} '" --out "' outFile '_' num2str(i) '" --db "' fullfile(tmpDB) '" --more-sensitive --outfmt 6 qseqid sseqid evalue pident length bitscore ppos --threads ' cores ]);
0113     if developMode
0114         diamondReport.diamondTxtOutput{numel(diamondReport.diamondTxtOutput)+1}=importdata([outFile '_' num2str(i)]);
0115     end
0116     if status~=0
0117         EM=['DIAMOND blastp did not run successfully, error: ', num2str(status)];
0118         dispEM(EM,true);
0119     end
0120 end
0121 delete([tmpDB filesep 'tmpDB*']);
0122 
0123 %Then create a database for each of the reference organisms and blast the
0124 %new organism against them
0125 for i=1:numel(refFastaFiles)
0126     if ~hideVerbose
0127         fprintf(['Running DIAMOND blastp with "' organismID{1} '" against "' modelIDs{i} '"..\n']);
0128     end
0129     [status, message]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" makedb --in "' refFastaFiles{i} '" --db "' fullfile(tmpDB) '"']);
0130     if status~=0
0131         EM=['DIAMOND makedb did not run successfully, error: ', num2str(status)];
0132         dispEM(EM,true);
0133     end
0134     [status, ~]=system(['"' fullfile(ravenPath,'software','diamond',['diamond' binEnd]) '" blastp --query "' fastaFile{1} '" --out "' outFile '_r' num2str(i) '" --db "' fullfile(tmpDB) '" --more-sensitive --outfmt 6 qseqid sseqid evalue pident length bitscore ppos --threads ' cores]);
0135     if developMode
0136         diamondReport.dbHashes{numel(diamondReport.dbHashes)+1} = char(regexp(message,'[a-f0-9]{32}','match'));
0137         diamondReport.diamondTxtOutput{numel(diamondReport.diamondTxtOutput)+1}=importdata([outFile '_r' num2str(i)]);
0138     end
0139     if status~=0
0140         EM=['DIAMOND blastp did not run successfully, error: ', num2str(status)];
0141         dispEM(EM,true);
0142     end
0143     delete([tmpDB filesep 'tmpDB*']);
0144 end
0145 
0146 %Done with the DIAMOND blastp, do the parsing of the text files
0147 for i=1:numel(refFastaFiles)*2
0148     tempStruct=[];
0149     if i<=numel(refFastaFiles)
0150         tempStruct.fromId=modelIDs{i};
0151         tempStruct.toId=organismID{1};
0152         A=readtable([outFile '_' num2str(i)],'Delimiter','\t','Format','%s%s%f%f%f%f%f');
0153     else
0154         tempStruct.fromId=organismID{1};
0155         tempStruct.toId=modelIDs{i-numel(refFastaFiles)};
0156         A=readtable([outFile '_r' num2str(i-numel(refFastaFiles))],'Delimiter','\t','Format','%s%s%f%f%f%f%f');
0157     end
0158     tempStruct.fromGenes=A{:,1};
0159     tempStruct.toGenes=A{:,2};
0160     tempStruct.evalue=table2array(A(:,3));
0161     tempStruct.identity=table2array(A(:,4));
0162     tempStruct.aligLen=table2array(A(:,5));
0163     tempStruct.bitscore=table2array(A(:,6));
0164     tempStruct.ppos=table2array(A(:,7));
0165     blastStructure=[blastStructure tempStruct];
0166 end
0167 
0168 %Remove the old tempfiles
0169 delete([outFile '*']);
0170 %Remove the temp fasta files
0171 delete(files{:})
0172 end

Generated by m2html © 2005