Home > external > parseScores.m

parseScores

PURPOSE ^

parseScores - Parse the output from a predictor to generate the GSS

SYNOPSIS ^

function GSS = parseScores(inputFile, predictor)

DESCRIPTION ^

 parseScores
    Parse the output from a predictor to generate the GSS

    Input:
    inputFile    a file with the output from the predictor
    predictor    the predictor that was used. 'wolf' for WoLF PSORT, 'cello'
               for CELLO, 'deeploc' for DeepLoc (optional, default 'wolf')

    Output:
    GSS         a gene scoring structure to be used in predictLocalization

    The function normalizes the scores so that the best score for each gene
    is 1.0.

    Usage: GSS = parseScores(inputFile, predictor)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function GSS = parseScores(inputFile, predictor)
% parseScores
%    Parse the output from a predictor to generate the GSS
%
%    Input:
%    inputFile    a file with the output from the predictor
%    predictor    the predictor that was used. 'wolf' for WoLF PSORT, 'cello'
%               for CELLO, 'deeploc' for DeepLoc (optional, default 'wolf').
%               The comparison is case-insensitive. Any other value is
%               reported through dispEM
%
%    Output:
%    GSS         a gene scoring structure to be used in predictLocalization,
%               with the fields
%        compartments    cell array with the compartment names
%        scores          matrix with one row per gene and one column per
%                        compartment
%        genes           column cell array with the gene names
%
%    The function normalizes the scores so that the best score for each gene
%    is 1.0. Genes for which all scores are zero are left as zeros.
%
%    Usage: GSS = parseScores(inputFile, predictor)

if nargin<2
    predictor='wolf';
else
    predictor=char(predictor);
end
inputFile=char(inputFile);

%Validate the predictor before touching the file, so that an unsupported
%value gives a clear message rather than an undefined GSS further down
if ~any(strcmpi(predictor,{'wolf','cello','deeploc'}))
    EM=['Unknown predictor "' predictor '". Valid options are ''wolf'', ''cello'' and ''deeploc'''];
    dispEM(EM);
end

%The file is opened exactly once; all branches below read from this handle
fid=fopen(inputFile,'r');

%fopen returns -1 when the file cannot be opened.
%NOTE(review): dispEM is defined elsewhere; this code relies on it raising
%an error when called with a single argument - confirm
if fid<0
    EM='Could not open file';
    dispEM(EM);
end

%Make sure that the file is closed when the function exits, also if the
%parsing below throws an error
closeFile=onCleanup(@() fclose(fid));

if strcmpi(predictor,'wolf')
    A=textscan(fid,'%s','Delimiter','\n','CommentStyle','#');
    
    %Each element should be for one gene, but some of them are on the form
    %"Pc20g11350: treating 9 X's as Glycines". Those should be removed
    I=~cellfun(@any,strfind(A{1},'treating'));
    
    B=regexp(A{1}(I),' ','split');
    
    %Reserve space for stuff
    GSS.compartments={};
    GSS.scores=[]; %Do not know number of comps yet
    GSS.genes=cell(numel(B),1);
    
    %Parsing is a bit cumbersome as ', ' is used as a delimiter in some
    %cases and ' ' in others. Use strrep to get rid of ','
    for i=1:numel(B)
        b=strrep(B{i},',','');
        GSS.genes{i}=b{1};
        
        %Then go through the (compartment, score) pairs that follow the
        %gene name and add new compartments as they are found
        for j=2:2:numel(b)-1
            [~, J]=ismember(b(j),GSS.compartments);
            
            %Add new compartment if it does not exist. The new column is
            %zero for all genes until a score is read for it
            if J==0
                GSS.compartments=[GSS.compartments;b(j)];
                J=numel(GSS.compartments);
                GSS.scores=[GSS.scores zeros(numel(B),1)];
            end
            
            GSS.scores(i,J)=str2double(b(j+1));
        end
    end
elseif strcmpi(predictor,'cello')
    %Read the title line and fetch the list of compartments
    tline=fgetl(fid);
    if ~ischar(tline)
        EM='The input file is empty';
        dispEM(EM);
    end
    tline=regexprep(tline,'^.+#Combined:\t','');
    tline=regexprep(tline,'\t#Most-likely-Location.+','');
    GSS.compartments=transpose(regexp(tline,'\t','split'));
    
    %Start from empty results so that a file with only a title line still
    %gives a well-formed structure
    GSS.scores=zeros(0,numel(GSS.compartments));
    GSS.genes=cell(0,1);
    
    %Now iterate through the following lines in the file. Each row
    %corresponds to one gene and it consists of the scores for
    %compartments. Gene name is in the end of each line
    row=0;
    while true
        tline=fgetl(fid);
        if ~ischar(tline)
            break;
        end
        
        %Skip blank lines instead of failing on them
        if isempty(strtrim(tline))
            continue;
        end
        row=row+1;
        
        %Remove everything up to and including the last ':<tab>', and
        %everything from the first space, before splitting on tabs
        tline=regexprep(tline,'^.+:\t','');
        tline=regexprep(tline,' .+','');
        tline=regexp(tline,'\t','split');
        GSS.scores(row,:)=str2double(tline(1:numel(GSS.compartments)));
        GSS.genes{row,1}=tline{1,end};
    end
elseif strcmpi(predictor,'deeploc')
    %Read the title line and fetch the list of compartments. The
    %compartments start in the fourth comma-separated column
    tline=fgetl(fid);
    if ~ischar(tline)
        EM='The input file is empty';
        dispEM(EM);
    end
    GSS.compartments=regexp(tline,',','split');
    GSS.compartments=GSS.compartments(4:end);
    
    %Start from empty results so that a file with only a title line still
    %gives a well-formed structure
    GSS.scores=zeros(0,numel(GSS.compartments));
    GSS.genes=cell(0,1);
    
    %Now iterate through the following lines in the file. Each row
    %corresponds to one gene. Gene name is in the first column and the
    %scores for the compartments are in the fourth column and onwards
    row=0;
    while true
        tline=fgetl(fid);
        if ~ischar(tline)
            break;
        end
        
        %Skip blank lines instead of failing on them
        if isempty(strtrim(tline))
            continue;
        end
        row=row+1;
        
        tline=regexp(tline,',','split');
        GSS.scores(row,:)=str2double(tline(4:end));
        GSS.genes{row,1}=tline{1,1};
    end
end

%Check if there are duplicate genes. J has one index per unique gene and K
%has one index per input gene, so they differ in length if there are
%duplicates
[~, J, K]=unique(GSS.genes);

if numel(J)~=numel(K)
    EM='There are duplicate genes in the input file';
    dispEM(EM,false);
    
    %Keep one occurrence of each gene
    GSS.genes=GSS.genes(J);
    GSS.scores=GSS.scores(J,:);
end

%Normalize so that the best score for each gene is 1.0. Rows where the
%best score is zero are divided by 1 instead, since 0*(1/0) would turn the
%whole row into NaN
I=max(GSS.scores,[],2);
I(I==0)=1;
GSS.scores=bsxfun(@times, GSS.scores, 1./I);
end

Generated by m2html © 2005