Home > hpa > parseHPA.m

parseHPA

PURPOSE ^

parseHPA

SYNOPSIS ^

function hpaData=parseHPA(fileName, version)

DESCRIPTION ^

 parseHPA
   Parses a database dump of the Human Protein Atlas (HPA)

   Input:
   fileName            comma- or tab-separated database dump of HPA. For details
                       regarding the format, see
                       http://www.proteinatlas.org/about/download.
   version             version of HPA [optional, default=19]


   Output:
   hpaData
       genes               cell array with the unique gene identifiers. In
                           version >=17 these are the Ensembl gene IDs; see
                           geneNames below for the corresponding gene names
       geneNames           cell array with the gene names, indexed the
                           same way as genes. Only returned for version >=17.
       tissues             cell array with the tissue names. The list may not be
                           unique, as there can be multiple cell types per tissue
       celltypes           cell array with the cell type names for each tissue
       levels              cell array with the unique expression levels
       types               cell array with the unique evidence types
       reliabilities       cell array with the unique reliability levels

       gene2Level          gene-to-expression level mapping in sparse matrix form.
                           The value for element i,j is the index in
                           hpaData.levels of gene i in cell type j
       gene2Type           gene-to-evidence type mapping in sparse matrix form.
                           The value for element i,j is the index in
                           hpaData.types of gene i in cell type j. Only
                           returned for version <17.
       gene2Reliability    gene-to-reliability level mapping in sparse matrix form.
                           The value for element i,j is the index in
                           hpaData.reliabilities of gene i in cell type j

 Usage: hpaData=parseHPA(fileName,version)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function hpaData=parseHPA(fileName, version)
% parseHPA
%   Parses a database dump of the Human Protein Atlas (HPA)
%
%   Input:
%   fileName            comma- or tab-separated database dump of HPA. For details
%                       regarding the format, see
%                       http://www.proteinatlas.org/about/download.
%   version             version of HPA [optional, default=19]. Versions >=17
%                       are read as tab-separated files with the columns
%                       Gene, Gene name, Tissue, Cell type, Level,
%                       Reliability. Older versions are read as
%                       comma-separated files with the columns Gene,
%                       Tissue, Cell type, Level, Expression type,
%                       Reliability.
%
%
%   Output:
%   hpaData
%       genes               cell array with the unique gene identifiers
%                           (first column of the file). In version >=17
%                           these are the Ensembl gene IDs, see geneNames
%                           for the corresponding gene names
%       geneNames           cell array with the gene names, indexed the
%                           same way as genes. Only returned for
%                           version >=17
%       tissues             cell array with the tissue names. The list may not be
%                           unique, as there can be multiple cell types per tissue
%       celltypes           cell array with the cell type names for each tissue
%       levels              cell array with the unique expression levels
%       types               cell array with the unique evidence types.
%                           Only returned for version <17
%       reliabilities       cell array with the unique reliability levels
%
%       gene2Level          gene-to-expression level mapping in sparse matrix form.
%                           The value for element i,j is the index in
%                           hpaData.levels of gene i in cell type j
%       gene2Type           gene-to-evidence type mapping in sparse matrix form.
%                           The value for element i,j is the index in
%                           hpaData.types of gene i in cell type j. Only
%                           returned for version <17
%       gene2Reliability    gene-to-reliability level mapping in sparse matrix form.
%                           The value for element i,j is the index in
%                           hpaData.reliabilities of gene i in cell type j
%
%   If the same gene/tissue/cell type combination occurs on several rows,
%   only the first row is used and a warning with the identifier
%   parseHPA:duplicateEntries is issued.
%
% Usage: hpaData=parseHPA(fileName,version)

if nargin<2
    version=19; %Change this and add code for more versions when the current HPA version is increased and the format is changed
end

fileName=char(fileName);
if ~isfile(fileName)
    error('HPA file %s cannot be found',string(fileName));
end

%The two supported layouts only differ in the delimiter and in which
%columns that are present
if (version >= 17)
    isNewFormat=true;
    delimiter='\t';
    headers={'Gene' 'Gene name' 'Tissue' 'Cell type' 'Level' 'Reliability'};
else
    isNewFormat=false;
    delimiter=',';
    headers={'Gene' 'Tissue' 'Cell type' 'Level' 'Expression type' 'Reliability'};
end

fid=fopen(fileName,'r');
if fid<0
    error('HPA file %s cannot be opened',fileName);
end
%Make sure that the file is closed also if textscan throws an error
closeFile=onCleanup(@()fclose(fid));
hpa=textscan(fid,'%q %q %q %q %q %q','Delimiter',delimiter);
clear closeFile %Closes the file

%Go through and see if the headers match what was expected. An empty
%column (e.g. from an empty file) is reported as a missing header rather
%than as an indexing error
for i=1:numel(headers)
    if isempty(hpa{i}) || ~strcmpi(headers{i},hpa{i}{1})
        EM=['Could not find the header "' headers{i} '". Make sure that the input file matches the format specified at http://www.proteinatlas.org/about/download'];
        dispEM(EM);
    end
    %Remove the header line here
    hpa{i}(1)=[];
end

%Give the columns names so that the rest of the code is independent of
%the column order
if isNewFormat
    [geneCol, geneNameCol, tissueCol, cellTypeCol, levelCol, reliabilityCol]=hpa{:};
else
    [geneCol, tissueCol, cellTypeCol, levelCol, typeCol, reliabilityCol]=hpa{:};
end

%Get the unique values of each data type. The fields are assigned in the
%same order as the columns appear in the file
[hpaData.genes, P, I]=unique(geneCol);
if isNewFormat
    hpaData.geneNames=geneNameCol(P); %make this vector use the same index as genes
end
%A tissue/cell type pair is identified by the two names joined by a euro
%sign. char(8364) is used instead of a literal so that the result does not
%depend on the encoding of this source file
[~, J, K]=unique(strcat(tissueCol,char(8364),cellTypeCol));
hpaData.tissues=tissueCol(J);
hpaData.celltypes=cellTypeCol(J);
[hpaData.levels, ~, L]=unique(levelCol);
if ~isNewFormat
    [hpaData.types, ~, M]=unique(typeCol);
end
[hpaData.reliabilities, ~, N]=unique(reliabilityCol);

%sparse() adds up the values of repeated (i,j) pairs, which would turn
%duplicated gene/cell type rows into meaningless sums of indices. Only
%keep the first occurrence of each pair
[~, keep]=unique([I K],'rows');
if numel(keep)<numel(I)
    warning('parseHPA:duplicateEntries','%d duplicated gene/cell type entries were found in %s. Only the first occurrence of each is used',numel(I)-numel(keep),fileName);
    I=I(keep);
    K=K(keep);
    L=L(keep);
    N=N(keep);
    if ~isNewFormat
        M=M(keep);
    end
end

%Map the data to sparse matrices instead
nGenes=numel(hpaData.genes);
nCellTypes=numel(hpaData.tissues);
hpaData.gene2Level=sparse(I,K,L,nGenes,nCellTypes);
if ~isNewFormat
    hpaData.gene2Type=sparse(I,K,M,nGenes,nCellTypes);
end
hpaData.gene2Reliability=sparse(I,K,N,nGenes,nCellTypes);
end

Generated by m2html © 2005