Home > external > getBlast.m

getBlast

PURPOSE ^

getBlast

SYNOPSIS ^

function [blastStructure,blastReport]=getBlast(organismID,fastaFile,modelIDs,refFastaFiles,developMode,hideVerbose)

DESCRIPTION ^

 getBlast
   Performs a bidirectional BLAST between the organism of interest and a
   set of template organisms

   Input:
   organismID      the id of the organism of interest. This should also
                   match with the id supplied to getModelFromHomology
   fastaFile       a FASTA file with the protein sequences for the
                   organism of interest
   modelIDs        a cell array of model ids. These must match the
                   "model.id" fields in the "models" structure if the
                   output is to be used with getModelFromHomology
   refFastaFiles   a cell array with the paths to the corresponding FASTA
                   files
   developMode     true if blastReport should be generated that is used
                   in the unit testing function for BLAST+ (optional, default
                   false)
   hideVerbose     true if no status messages should be printed (optional,
                   default false)

   Output:
   blastStructure  structure containing the bidirectional homology
                   measurements that can be used by getModelFromHomology
   blastReport     structure containing MD5 hashes for FASTA database
                   files and non-parsed BLAST output data. Will be blank
                   if developMode is false.

   NOTE: This function calls BLAST+ to perform a bidirectional homology
   test between the organism of interest and a set of other organisms
   using standard settings. The only filtering this function does is the
   removal of hits with an E-value higher than 10e-5. The other homology
   measurements can be implemented using getBlastFromExcel.

 Usage: [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
    modelIDs,refFastaFiles,developMode,hideVerbose)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
0002     modelIDs,refFastaFiles,developMode,hideVerbose)
0003 % getBlast
0004 %   Performs a bidirectional BLAST between the organism of interest and a
0005 %   set of template organisms
0006 %
0007 %   Input:
0008 %   organismID      the id of the organism of interest. This should also
0009 %                   match with the id supplied to getModelFromHomology
0010 %   fastaFile       a FASTA file with the protein sequences for the
0011 %                   organism of interest
0012 %   modelIDs        a cell array of model ids. These must match the
0013 %                   "model.id" fields in the "models" structure if the
0014 %                   output is to be used with getModelFromHomology
0015 %   refFastaFiles   a cell array with the paths to the corresponding FASTA
0016 %                   files
0017 %   developMode     true if blastReport should be generated that is used
0018 %                   in the unit testing function for BLAST+ (optional, default
0019 %                   false)
0020 %   hideVerbose     true if no status messages should be printed (optional,
0021 %                   default false)
0022 %
0023 %   Output:
0024 %   blastStructure  structure containing the bidirectional homology
0025 %                   measurements that can be used by getModelFromHomology
0026 %   blastReport     structure containing MD5 hashes for FASTA database
0027 %                   files and non-parsed BLAST output data. Will be blank
0028 %                   if developMode is false.
0029 %
0030 %   NOTE: This function calls BLAST+ to perform a bidirectional homology
0031 %   test between the organism of interest and a set of other organisms
0032 %   using standard settings. The only filtering this function does is the
0033 %   removal of hits with an E-value higher than 10e-5. The other homology
0034 %   measurements can be implemented using getBlastFromExcel.
0035 %
0036 % Usage: [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
0037 %    modelIDs,refFastaFiles,developMode,hideVerbose)
0038 
0039 if nargin<5
0040     developMode = false;
0041 end
0042 if nargin<6
0043     hideVerbose = false;
0044 end
0045 
0046 %Everything should be cell arrays
0047 organismID=convertCharArray(organismID);
0048 fastaFile=convertCharArray(fastaFile);
0049 modelIDs=convertCharArray(modelIDs);
0050 refFastaFiles=convertCharArray(refFastaFiles);
0051 
0052 %Create blank structures for results
0053 blastStructure=[];
0054 blastReport.dbHashes.phr={};
0055 blastReport.dbHashes.pot={};
0056 blastReport.dbHashes.psq={};
0057 blastReport.dbHashes.pto={};
0058 blastReport.blastTxtOutput={};
0059 
0060 %Get the directory for RAVEN Toolbox
0061 ravenPath=findRAVENroot();
0062 
0063 %Generate temporary names for BLAST databases and output files
0064 tmpDB=tempname;
0065 outFile=tempname;
0066 if ispc && contains(tmpDB,' ')
0067     warning(['MATLAB assigned ''%s'' as temporary file path, but it '...
0068              'contains a space character, which is not compatible with '...
0069              'BLAST. Instead, a temporary folder will be initiated at '...
0070              '''C:\\tempRAVEN\\'' which should manually be removed when '...
0071              'finished.'],tmpDB)
0072     tmpDB=tempname('C:\tempRAVEN');
0073     outFile=tempname('C:\tempRAVEN');
0074 end
0075 
0076 %Check for existence of files. If no full path is specified for a file,
0077 %assume that it is in the current folder
0078 if isrow(refFastaFiles)
0079     files=horzcat(fastaFile,refFastaFiles);
0080 else
0081     files=vertcat(fastaFile,refFastaFiles);
0082 end
0083 
0084 files=checkFileExistence(files,2); %Copy files to temp dir
0085 fastaFile = files(1);
0086 refFastaFiles = files(2:end);
0087 
0088 %Identify the operating system
0089 if isunix
0090     if ismac
0091         binEnd='.mac';
0092     else
0093         binEnd='';
0094     end
0095 elseif ispc
0096     binEnd='.exe';
0097     setenv('BLASTDB_LMDB_MAP_SIZE','1000000');
0098 else
0099     dispEM('Unknown OS, exiting.')
0100     return
0101 end
0102 
0103 %Run BLAST multi-threaded to use all logical cores assigned to MATLAB
0104 cores = evalc('feature(''numcores'')');
0105 cores = strsplit(cores, 'MATLAB was assigned: ');
0106 cores = regexp(cores{2},'^\d*','match');
0107 cores = cores{1};
0108 
0109 %Create a database for the new organism and blast each of the refFastaFiles
0110 %against it
0111 [status, message]=system(['"' fullfile(ravenPath,'software','blast+',['makeblastdb' binEnd]) '" -in "' fastaFile{1} '" -out "' fullfile(tmpDB, 'tmpDB') '" -dbtype prot']);
0112 if developMode
0113     blastReport.dbHashes.phr{numel(blastReport.dbHashes.phr)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.phr'));
0114     blastReport.dbHashes.pot{numel(blastReport.dbHashes.pot)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pot'));
0115     blastReport.dbHashes.psq{numel(blastReport.dbHashes.psq)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.psq'));
0116     blastReport.dbHashes.pto{numel(blastReport.dbHashes.pto)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pto'));
0117 end
0118 if status~=0
0119     error('makeblastdb did not run successfully, error:\n%s',strip(message))
0120 end
0121 
0122 for i=1:numel(refFastaFiles)
0123     if ~hideVerbose
0124         fprintf(['BLASTing "' modelIDs{i} '" against "' organismID{1} '"..\n']);
0125     end
0126     [status, message]=system(['"' fullfile(ravenPath,'software','blast+',['blastp' binEnd]) '" -query "' refFastaFiles{i} '" -out "' outFile '_' num2str(i) '" -db "' fullfile(tmpDB, 'tmpDB') '" -evalue 10e-5 -outfmt "10 qseqid sseqid evalue pident length bitscore ppos" -num_threads "' cores '"']);
0127     if developMode
0128         blastReport.blastTxtOutput{numel(blastReport.blastTxtOutput)+1}=importdata([outFile '_' num2str(i)]);
0129     end
0130     if status~=0
0131         error('blastp did not run successfully, error:\n%s',strip(message))
0132     end
0133 end
0134 delete([tmpDB filesep 'tmpDB*']);
0135 
0136 %Then create a database for each of the reference organisms and blast the
0137 %new organism against them
0138 for i=1:numel(refFastaFiles)
0139     if ~hideVerbose
0140         fprintf(['BLASTing "' organismID{1} '" against "' modelIDs{i} '"..\n']);
0141     end
0142     [status, message]=system(['"' fullfile(ravenPath,'software','blast+',['makeblastdb' binEnd]) '" -in "' refFastaFiles{i} '" -out "' fullfile(tmpDB, 'tmpDB') '" -dbtype prot']);
0143     if status~=0
0144         error('makeblastdb did not run successfully, error:\n%s',strip(message))
0145     end
0146     [status, message]=system(['"' fullfile(ravenPath,'software','blast+',['blastp' binEnd]) '" -query "' fastaFile{1} '" -out "' outFile '_r' num2str(i) '" -db "' fullfile(tmpDB, 'tmpDB') '" -evalue 10e-5 -outfmt "10 qseqid sseqid evalue pident length bitscore ppos" -num_threads "' cores '"']);
0147     if developMode
0148         blastReport.dbHashes.phr{numel(blastReport.dbHashes.phr)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.phr'));
0149         blastReport.dbHashes.pot{numel(blastReport.dbHashes.pot)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pot'));
0150         blastReport.dbHashes.psq{numel(blastReport.dbHashes.psq)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.psq'));
0151         blastReport.dbHashes.pto{numel(blastReport.dbHashes.pto)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pto'));
0152         blastReport.blastTxtOutput{numel(blastReport.blastTxtOutput)+1}=importdata([outFile '_r' num2str(i)]);
0153     end    
0154     if status~=0
0155         error('blastp did not run successfully, error:\n%s',strip(message))
0156     end
0157     delete([tmpDB filesep 'tmpDB*']);
0158 end
0159 
0160 %Done with the BLAST, do the parsing of the text files
0161 for i=1:numel(refFastaFiles)*2
0162     tempStruct=[];
0163     if i<=numel(refFastaFiles)
0164         tempStruct.fromId=modelIDs{i};
0165         tempStruct.toId=organismID{1};
0166         A=readtable([outFile '_' num2str(i)],'Delimiter',',','Format','%s%s%f%f%f%f%f');
0167     else
0168         tempStruct.fromId=organismID{1};
0169         tempStruct.toId=modelIDs{i-numel(refFastaFiles)};
0170         A=readtable([outFile '_r' num2str(i-numel(refFastaFiles))],'Delimiter',',','Format','%s%s%f%f%f%f%f');
0171     end
0172     tempStruct.fromGenes=A{:,1};
0173     tempStruct.toGenes=A{:,2};
0174     tempStruct.evalue=table2array(A(:,3));
0175     tempStruct.identity=table2array(A(:,4));
0176     tempStruct.aligLen=table2array(A(:,5));
0177     tempStruct.bitscore=table2array(A(:,6));
0178     tempStruct.ppos=table2array(A(:,7));
0179     blastStructure=[blastStructure tempStruct];
0180 end
0181 
0182 %Remove the old tempfiles
0183 delete([outFile '*']);
0184 %Remove the temp fasta files
0185 delete(files{:})
0186 end

Generated by m2html © 2005