Home > external > getBlast.m

getBlast

PURPOSE ^

getBlast

SYNOPSIS ^

function [blastStructure,blastReport]=getBlast(organismID,fastaFile,modelIDs,refFastaFiles,developMode,hideVerbose)

DESCRIPTION ^

 getBlast
   Performs a bidirectional BLAST between the organism of interest and a
   set of template organisms

   Input:
   organismID      the id of the organism of interest. This should also
                   match with the id supplied to getModelFromHomology
   fastaFile       a FASTA file with the protein sequences for the
                   organism of interest
   modelIDs        a cell array of model ids. These must match the
                   "model.id" fields in the "models" structure if the
                   output is to be used with getModelFromHomology
   refFastaFiles   a cell array with the paths to the corresponding FASTA
                   files
   developMode     true if blastReport should be generated that is used
                   in the unit testing function for BLAST+ (opt, default
                   false)
   hideVerbose     true if no status messages should be printed (opt,
                   default false)

   Output:
   blastStructure  structure containing the bidirectional homology
                   measurements that can be used by getModelFromHomology
   blastReport     structure containing MD5 hashes for FASTA database
                   files and non-parsed BLAST output data. Will be blank
                   if developMode is false.

   NOTE: This function calls BLAST+ to perform a bidirectional homology
   test between the organism of interest and a set of other organisms
   using standard settings. The only filtering this function does is the
   removal of hits with an E-value higher than 10e-5. The other homology
   measurements can be implemented using getBlastFromExcel.

   Usage: [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
    modelIDs,refFastaFiles,developMode,hideVerbose)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
0002     modelIDs,refFastaFiles,developMode,hideVerbose)
0003 % getBlast
0004 %   Performs a bidirectional BLAST between the organism of interest and a
0005 %   set of template organisms
0006 %
0007 %   Input:
0008 %   organismID      the id of the organism of interest. This should also
0009 %                   match with the id supplied to getModelFromHomology
0010 %   fastaFile       a FASTA file with the protein sequences for the
0011 %                   organism of interest
0012 %   modelIDs        a cell array of model ids. These must match the
0013 %                   "model.id" fields in the "models" structure if the
0014 %                   output is to be used with getModelFromHomology
0015 %   refFastaFiles   a cell array with the paths to the corresponding FASTA
0016 %                   files
0017 %   developMode     true if blastReport should be generated that is used
0018 %                   in the unit testing function for BLAST+ (opt, default
0019 %                   false)
0020 %   hideVerbose     true if no status messages should be printed (opt,
0021 %                   default false)
0022 %
0023 %   Output:
0024 %   blastStructure  structure containing the bidirectional homology
0025 %                   measurements that can be used by getModelFromHomology
0026 %   blastReport     structure containing MD5 hashes for FASTA database
0027 %                   files and non-parsed BLAST output data. Will be blank
0028 %                   if developMode is false.
0029 %
0030 %   NOTE: This function calls BLAST+ to perform a bidirectional homology
0031 %   test between the organism of interest and a set of other organisms
0032 %   using standard settings. The only filtering this function does is the
0033 %   removal of hits with an E-value higher than 10e-5. The other homology
0034 %   measurements can be implemented using getBlastFromExcel.
0035 %
0036 %   Usage: [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
0037 %    modelIDs,refFastaFiles,developMode,hideVerbose)
0038 
0039 if nargin<5
0040     developMode = false;
0041 end
0042 if nargin<6
0043     hideVerbose = false;
0044 end
0045 
0046 %Everything should be cell arrays
0047 organismID=convertCharArray(organismID);
0048 fastaFile=convertCharArray(fastaFile);
0049 modelIDs=convertCharArray(modelIDs);
0050 refFastaFiles=convertCharArray(refFastaFiles);
0051 
0052 %Create blank structures for results
0053 blastStructure=[];
0054 blastReport.dbHashes.phr={};
0055 blastReport.dbHashes.pot={};
0056 blastReport.dbHashes.psq={};
0057 blastReport.dbHashes.pto={};
0058 blastReport.blastTxtOutput={};
0059 
0060 %Get the directory for RAVEN Toolbox
0061 ravenPath=findRAVENroot();
0062 
0063 %Generate temporary names for BLAST databases and output files
0064 tmpDB=tempname;
0065 outFile=tempname;
0066 
0067 %Check for existence of files. If no full path is specified for a file,
0068 %assume that it is in the current folder
0069 if isrow(refFastaFiles)
0070     files=horzcat(fastaFile,refFastaFiles);
0071 else
0072     files=vertcat(fastaFile,refFastaFiles);
0073 end
0074 
0075 files=checkFileExistence(files,2); %Copy files to temp dir
0076 fastaFile = files(1);
0077 refFastaFiles = files(2:end);
0078 
0079 %Identify the operating system
0080 if isunix
0081     if ismac
0082         binEnd='.mac';
0083     else
0084         binEnd='';
0085     end
0086 elseif ispc
0087     binEnd='.exe';
0088     setenv('BLASTDB_LMDB_MAP_SIZE','1000000');
0089 else
0090     dispEM('Unknown OS, exiting.')
0091     return
0092 end
0093 
0094 %Run BLAST multi-threaded to use all logical cores assigned to MATLAB
0095 cores = evalc('feature(''numcores'')');
0096 cores = strsplit(cores, 'MATLAB was assigned: ');
0097 cores = regexp(cores{2},'^\d*','match');
0098 cores = cores{1};
0099 
0100 %Create a database for the new organism and blast each of the refFastaFiles
0101 %against it
0102 [status, ~]=system(['"' fullfile(ravenPath,'software','blast+',['makeblastdb' binEnd]) '" -in ' fastaFile{1} ' -out "' fullfile(tmpDB, 'tmpDB') '" -dbtype prot']);
0103 if developMode
0104     blastReport.dbHashes.phr{numel(blastReport.dbHashes.phr)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.phr'));
0105     blastReport.dbHashes.pot{numel(blastReport.dbHashes.pot)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pot'));
0106     blastReport.dbHashes.psq{numel(blastReport.dbHashes.psq)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.psq'));
0107     blastReport.dbHashes.pto{numel(blastReport.dbHashes.pto)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pto'));
0108 end
0109 if status~=0
0110     EM=['makeblastdb did not run successfully, error: ', num2str(status)];
0111     dispEM(EM,true);
0112 end
0113 
0114 for i=1:numel(refFastaFiles)
0115     if ~hideVerbose
0116         fprintf(['BLASTing "' modelIDs{i} '" against "' organismID{1} '"..\n']);
0117     end
0118     [status, ~]=system(['"' fullfile(ravenPath,'software','blast+',['blastp' binEnd]) '" -query ' refFastaFiles{i} ' -out "' outFile '_' num2str(i) '" -db "' fullfile(tmpDB, 'tmpDB') '" -evalue 10e-5 -outfmt "10 qseqid sseqid evalue pident length bitscore ppos" -num_threads "' cores '"']);
0119     if developMode
0120         blastReport.blastTxtOutput{numel(blastReport.blastTxtOutput)+1}=importdata([outFile '_' num2str(i)]);
0121     end
0122     if status~=0
0123         EM=['blastp did not run successfully, error: ', num2str(status)];
0124         dispEM(EM,true);
0125     end
0126 end
0127 delete([tmpDB filesep 'tmpDB*']);
0128 
0129 %Then create a database for each of the reference organisms and blast the
0130 %new organism against them
0131 for i=1:numel(refFastaFiles)
0132     if ~hideVerbose
0133         fprintf(['BLASTing "' organismID{1} '" against "' modelIDs{i} '"..\n']);
0134     end
0135     [status, ~]=system(['"' fullfile(ravenPath,'software','blast+',['makeblastdb' binEnd]) '" -in ' refFastaFiles{i} ' -out "' fullfile(tmpDB, 'tmpDB') '" -dbtype prot']);
0136     if status~=0
0137         EM=['makeblastdb did not run successfully, error: ', num2str(status)];
0138         dispEM(EM,true);
0139     end
0140     [status, ~]=system(['"' fullfile(ravenPath,'software','blast+',['blastp' binEnd]) '" -query ' fastaFile{1} ' -out "' outFile '_r' num2str(i) '" -db "' fullfile(tmpDB, 'tmpDB') '" -evalue 10e-5 -outfmt "10 qseqid sseqid evalue pident length bitscore ppos" -num_threads "' cores '"']);
0141     if developMode
0142         blastReport.dbHashes.phr{numel(blastReport.dbHashes.phr)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.phr'));
0143         blastReport.dbHashes.pot{numel(blastReport.dbHashes.pot)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pot'));
0144         blastReport.dbHashes.psq{numel(blastReport.dbHashes.psq)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.psq'));
0145         blastReport.dbHashes.pto{numel(blastReport.dbHashes.pto)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pto'));
0146         blastReport.blastTxtOutput{numel(blastReport.blastTxtOutput)+1}=importdata([outFile '_r' num2str(i)]);
0147     end    
0148     if status~=0
0149         EM=['blastp did not run successfully, error: ', num2str(status)];
0150         dispEM(EM,true);
0151     end
0152     delete([tmpDB filesep 'tmpDB*']);
0153 end
0154 
0155 %Done with the BLAST, do the parsing of the text files
0156 for i=1:numel(refFastaFiles)*2
0157     tempStruct=[];
0158     if i<=numel(refFastaFiles)
0159         tempStruct.fromId=modelIDs{i};
0160         tempStruct.toId=organismID{1};
0161         A=readtable([outFile '_' num2str(i)],'Delimiter',',','Format','%s%s%f%f%f%f%f');
0162     else
0163         tempStruct.fromId=organismID{1};
0164         tempStruct.toId=modelIDs{i-numel(refFastaFiles)};
0165         A=readtable([outFile '_r' num2str(i-numel(refFastaFiles))],'Delimiter',',','Format','%s%s%f%f%f%f%f');
0166     end
0167     tempStruct.fromGenes=A{:,1};
0168     tempStruct.toGenes=A{:,2};
0169     tempStruct.evalue=table2array(A(:,3));
0170     tempStruct.identity=table2array(A(:,4));
0171     tempStruct.aligLen=table2array(A(:,5));
0172     tempStruct.bitscore=table2array(A(:,6));
0173     tempStruct.ppos=table2array(A(:,7));
0174     blastStructure=[blastStructure tempStruct];
0175 end
0176 
0177 %Remove the old tempfiles
0178 delete([outFile '*']);
0179 %Remove the temp fasta files
0180 delete(files{:})
0181 end

Generated by m2html © 2005