Home > external > getBlastFromExcel.m

getBlastFromExcel

PURPOSE ^

getBlastFromExcel

SYNOPSIS ^

function blastStructure=getBlastFromExcel(models,blastFile,organismId)

DESCRIPTION ^

 getBlastFromExcel
   Retrieves gene homology information from Excel files. Used as
   input to getModelFromHomology.

   Input:
   models          a cell array of model structures
   blastFile       Excel file with homology information
   organismId      the id of the organism of interest (as described in the
                   Excel file)

   Output:
   blastStructure  structure containing the information in the Excel
                   sheets.

   The Excel file should contain a number of spreadsheets which in turn
   contain the bidirectional homology measurements between the genes in the
   organisms. The first and second column headers in each sheet are the
   "from" and "to" model ids (as defined in models or for the new organism).
   The entries should correspond to the gene names in those models. The third,
   fourth, fifth, sixth and seventh columns represent the E-value, alignment
   length, identity, bitscore and percentage of positive-scoring matches for
   each measurement (captions should be "E-value", "Alignment length",
   "Identity", "Bitscore" and "PPOS").

 Usage: blastStructure=getBlastFromExcel(models,blastFile,organismId)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function blastStructure=getBlastFromExcel(models,blastFile,organismId)
% getBlastFromExcel
%   Retrieves gene homology information from Excel files. Used as
%   input to getModelFromHomology.
%
%   Input:
%   models          a cell array of model structures (only the "id" field
%                   of each model is read here)
%   blastFile       Excel file with homology information
%   organismId      the id of the organism of interest (as described in the
%                   Excel file)
%
%   Output:
%   blastStructure  structure array with one element per accepted sheet and
%                   the fields toId, fromId, fromGenes, toGenes, evalue,
%                   aligLen, identity, bitscore and ppos. Empty ([]) if no
%                   sheet was accepted.
%
%   The Excel file should contain a number of spreadsheets which in turn
%   contain the bidirectional homology measurements between the genes in the
%   organisms. The first and second column headers in each sheet are the
%   "from" and "to" model ids (as defined in models or for the new organism).
%   The entries should correspond to the gene names in those models. The third,
%   fourth, fifth, sixth and seventh columns represent the E-value, alignment
%   length, identity, bitscore and percentage of positive-scoring matches for
%   each measurement (captions should be "E-value", "Alignment length",
%   "Identity", "Bitscore" and "PPOS").
%
%   Sheets that are empty, lack the expected captions, refer to unknown
%   organisms or do not involve organismId are skipped with a warning.
%   Rows with an empty (or non-text) gene name, or with a missing E-value,
%   alignment length or identity, are removed.
%
% Usage: blastStructure=getBlastFromExcel(models,blastFile,organismId)

if ~isfile(blastFile)
    error('BLAST result file %s cannot be found',string(blastFile));
end
organismId=char(organismId);

blastStructure=[];

%Get a list of model IDs. The organism of interest is always at index 1
organisms=cell(numel(models)+1,1);
organisms{1}=organismId;
for i=1:numel(models)
    organisms{i+1}=models{i}.id;
end

%Get all the spreadsheets in the file
[fileType, sheets]=xlsfinfo(blastFile);

%Check if the file is a Microsoft Excel Spreadsheet
if ~any(regexp(fileType,'Excel Spreadsheet'))
    EM='The file is not a Microsoft Excel Spreadsheet';
    dispEM(EM);
end

expectedCaptions={'E-value','Alignment length','Identity','Bitscore','PPOS'};

for i=1:numel(sheets)
    %Read the untrimmed cell contents. The separate numeric/text outputs of
    %xlsread are trimmed independently of each other, which shifts the
    %numeric columns if a gene name happens to be stored as a number
    [~,~,raw]=xlsread(blastFile,i);

    %Check if the sheet has the right header. The size checks make empty or
    %too narrow sheets count as wrongly formatted instead of raising an
    %indexing error
    hasHeader=size(raw,1)>=1 && size(raw,2)>=7 && ...
        all(strcmpi(raw(1,3:7),expectedCaptions));
    if ~hasHeader
        EM=['The data in sheet ' sheets{i} ' is not correctly formatted. Ignoring sheet'];
        dispEM(EM,false);
        continue;
    end

    %At least one of the organisms must have a model. Only the first match
    %is used so that duplicated ids cannot produce non-scalar indexes
    fromID=find(strcmpi(raw{1,1},organisms),1);
    toID=find(strcmpi(raw{1,2},organisms),1);

    %Check that the organism ids exist and that one of them is the organism
    %of interest
    if isempty(toID) || isempty(fromID)
        EM=['The data in sheet ' sheets{i} ' has no corresponding model. Ignoring sheet'];
        dispEM(EM,false);
        continue;
    end
    if toID~=1 && fromID~=1
        EM=['The data in sheet ' sheets{i} ' does not involve the organism of interest. Ignoring sheet'];
        dispEM(EM,false);
        continue;
    end

    %Drop rows which are completely blank (they can be part of the used
    %range of the sheet without holding any data)
    data=raw(2:end,1:7);
    blankRows=all(cellfun(@isBlankCell,data),2);
    data(blankRows,:)=[];

    %Check that no gene ids are empty. This could for example be the case
    %if the gene names are wrongly formatted as numbers instead of strings
    validNames=cellfun(@isGeneName,data(:,1:2));
    emptyNames=~validNames(:,1) | ~validNames(:,2);
    if any(emptyNames)
        if all(emptyNames)
            EM=['Only empty gene names in sheet from ' organisms{fromID} ' to ' organisms{toID}];
            dispEM(EM);
        else
            EM=['Empty gene names in sheet from ' organisms{fromID} ' to ' organisms{toID} '. Ignoring genes with empty names'];
            dispEM(EM,false);
        end
    end

    %Non-numeric or missing measurements become NaN
    values=cellfun(@toNumericValue,data(:,3:7));

    %Remove matches without gene names and matches where the E-value,
    %alignment length or identity is NaN. The NaN filtering would have been
    %done anyways in getModelFromHomology, but it's neater to do it here
    discard=emptyNames | any(isnan(values(:,1:3)),2);
    data(discard,:)=[];
    values(discard,:)=[];

    k=numel(blastStructure)+1;
    blastStructure(k).toId=organisms{toID};
    blastStructure(k).fromId=organisms{fromID};
    blastStructure(k).fromGenes=data(:,1);
    blastStructure(k).toGenes=data(:,2);
    blastStructure(k).evalue=values(:,1);
    blastStructure(k).aligLen=values(:,2);
    blastStructure(k).identity=values(:,3);
    blastStructure(k).bitscore=values(:,4);
    blastStructure(k).ppos=values(:,5);
end

end

function tf=isGeneName(x)
%True if the cell content is a non-empty character array
tf=ischar(x) && ~isempty(x);
end

function tf=isBlankCell(x)
%True if the cell content is empty or a scalar NaN
tf=isempty(x) || (isnumeric(x) && isscalar(x) && isnan(x));
end

function v=toNumericValue(x)
%Returns the cell content as a double, or NaN if it is not a numeric scalar
if isnumeric(x) && isscalar(x)
    v=double(x);
else
    v=NaN;
end
end

Generated by m2html © 2005