Description of scoreComplexModel

0001 function [rxnScores, geneScores, hpaScores, arrayScores] = scoreComplexModel(model,hpaData,arrayData,tissue,celltype,noGeneScore,isozymeScoring,complexScoring,multipleCellScoring,hpaLevelScores)
0002 % scoreComplexModel
0003 %   Scores the reactions and genes in a model containing complex gene rules
0004 %   based on expression data from HPA and/or gene arrays.
0005 %
0006 %   It is highly recommended that the model grRules are "cleaned" and
0007 %   verified by the "cleanModelGeneRules" function prior to scoring the
0008 %   model with scoreComplexModel.
0009 %
0010 %   model               a model structure
0011 %   hpaData             HPA data structure from parseHPA (optional if arrayData is
0012 %                       supplied, default [])
0013 %   arrayData           gene expression data structure (optional if hpaData is
0014 %                       supplied, default [])
0015 %       genes           cell array with the unique gene names
0016 %       tissues         cell array with the tissue names. The list may not be
0017 %                       unique, as there can be multiple cell types per tissue
0018 %       celltypes       cell array with the cell type names for each tissue
0019 %       levels          GENESxTISSUES array with the expression level for
0020 %                       each gene in each tissue/celltype. NaN should be
0021 %                       used when no measurement was performed
0022 %       threshold       a single value or a vector of gene expression
0023 %                       thresholds, above which genes are considered to be
0024 %                       "expressed". (optional, by default, the mean expression
0025 %                       levels of each gene across all tissues in arrayData
0026 %                       will be used as the threshold values)
0027 %   tissue              tissue to score for. Should exist in either
0028 %                       hpaData.tissues or arrayData.tissues
0029 %   celltype            cell type to score for. Should exist in either
0030 %                       hpaData.celltypes or arrayData.celltypes for this
0031 %                       tissue (optional, default is to use the max values
0032 %                       among all the cell types for the tissue. Use [] if
0033 %                       you want to supply more arguments)
0034 %   noGeneScore         score for reactions without genes (optional, default -2)
0035 %   isozymeScoring      determines how scores are calculated for reactions
0036 %                       with multiple genes joined by "OR" expression(s)
0037 %                       ('min', 'max', 'median', 'average')
0038 %                       (optional, default 'max')
0039 %   complexScoring      determines how scores are calculated for reactions
0040 %                       with multiple genes joined by "AND" expression(s)
0041 %                       ('min', 'max', 'median', 'average')
0042 %                       (optional, default 'min')
0043 %   multipleCellScoring determines how scores are calculated when several
0044 %                       cell types are used ('max' or 'average')
0045 %                       (optional, default 'max')
0046 %   hpaLevelScores      structure with numerical scores for the expression
0047 %                       level categories from HPA. The structure should have a
0048 %                       "names" and a "scores" field (optional, see code for
0049 %                       default scores)
0050 %
0051 %   rxnScores       scores for each of the reactions in model
0052 %   geneScores      scores for each of the genes in model. Genes which are
0053 %                   not in the dataset(s) have -Inf as scores
0054 %   hpaScores       scores for each of the genes in model if only taking hpaData
0055 %                   into account. Genes which are not in the dataset(s)
0056 %                   have -Inf as scores
0057 %   arrayScores     scores for each of the genes in model if only taking arrayData
0058 %                   into account. Genes which are not in the dataset(s)
0059 %                   have -Inf as scores
0060 %
0061 % Usage: [rxnScores, geneScores, hpaScores, arrayScores]=scoreComplexModel(...
0062 %               model,hpaData,arrayData,tissue,celltype,noGeneScore,isozymeScoring,
0063 %               complexScoring,multipleCellScoring,hpaLevelScores)
0064 %
0065 
0066 
0067 if nargin < 5
0068     celltype = [];
0069 end
0070 if nargin < 6 || isempty(noGeneScore)
0071     noGeneScore = -2;
0072 end
0073 if nargin < 7 || isempty(isozymeScoring)
0074     isozymeScoring = 'max';
0075 end
0076 if nargin < 8 || isempty(complexScoring)
0077     complexScoring = 'min';
0078 end
0079 if nargin < 9 || isempty(multipleCellScoring)
0080     multipleCellScoring = 'max';
0081 end
0082 if nargin < 10
0083     % The first four are for APE, the other ones for staining
0084     hpaLevelScores.names = {'High' 'Medium' 'Low' 'None' 'Strong' 'Moderate' 'Weak' 'Negative' 'Not detected'};
0085     hpaLevelScores.scores = [20 15 10 -8 20 15 10 -8 -8];
0086 end
0087 
0088 if isempty(hpaData) && isempty(arrayData)
0089     EM = 'Must supply hpaData, arrayData or both';
0090     dispEM(EM);
0091 end
0092 if ~ismember(lower(isozymeScoring),{'min','max','median','average'})
0093     EM = 'Valid options for isozymeScoring are "min", "max", "median", or "average"';
0094     dispEM(EM);
0095 end
0096 if ~ismember(lower(complexScoring),{'min','max','median','average'})
0097     EM = 'Valid options for complexScoring are "min", "max", "median", or "average"';
0098     dispEM(EM);
0099 end
0100 if ~ismember(lower(multipleCellScoring),{'max','average'})
0101     EM = 'Valid options for multipleCellScoring are "max" or "average"';
0102     dispEM(EM);
0103 end
0104 
0105 
0106 % Throw an error if array data for only one tissue is supplied without
0107 % specifying threshold values
0108 if ~isempty(arrayData)
0109     if numel(unique(arrayData.tissues)) < 2
0110         if ~isfield(arrayData,'threshold') || isempty(arrayData.threshold)
0111             EM = 'arrayData must contain measurements for at least two celltypes/tissues since the score is calculated based on the expression level compared to the overall average';
0112             dispEM(EM);
0113         end
0114     end
0115 end
0116 
0117 % Process arrayData.threshold if necessary
0118 if isfield(arrayData,'threshold') && (numel(arrayData.threshold) == 1)
0119     % if only a single gene threshold value is provided, then just
0120     % duplicate this value for all genes.
0121     arrayData.threshold = arrayData.threshold*ones(size(arrayData.genes));
0122 end
0123 
0124 % This is so the code can ignore which combination of input data is used
0125 if isempty(arrayData)
0126     arrayData.genes={};
0127     arrayData.tissues={};
0128     arrayData.celltypes={};
0129     arrayData.levels=[];
0130     arrayData.threshold=[];
0131 end
0132 if isempty(hpaData)
0133     hpaData.genes={};
0134     hpaData.tissues={};
0135     hpaData.celltypes={};
0136     hpaData.levels={};
0137     hpaData.types={};
0138     hpaData.reliabilities={};
0139     hpaData.gene2Level=[];
0140     hpaData.gene2Type=[];
0141     hpaData.gene2Reliability=[];
0142 end
0143 
0144 % Check that the tissue exists
0145 if ~ismember(upper(tissue),upper(hpaData.tissues)) && ~ismember(upper(tissue),upper(arrayData.tissues))
0146     EM = 'The tissue name does not match';
0147     dispEM(EM);
0148 end
0149 if any(celltype)
0150     % Check that both data types has cell type defined if that is to be used
0151     if ~isfield(hpaData,'celltypes') || ~isfield(arrayData,'celltypes')
0152         EM = 'Both hpaData and arrayData must contain cell type information if cell type is to be used';
0153         dispEM(EM);
0154     end
0155     if ~ismember(upper(celltype),upper(hpaData.celltypes)) && ~ismember(upper(celltype),upper(arrayData.celltypes))
0156         EM = 'The cell type name does not match';
0157         dispEM(EM);
0158     end
0159 end
0160 
0161 % Some preprocessing of the structures to increase efficiency
0162 % Remove all tissues that are not the correct one
0163 J = ~strcmpi(hpaData.tissues,tissue);
0164 
0165 %If cell type is supplied, then only keep that cell type
0166 if any(celltype)
0167     J = J | ~strcmpi(hpaData.celltypes,celltype);
0168 end
0169 
0170 hpaData.tissues(J)=[];
0171 if isfield(hpaData,'celltypes')
0172     hpaData.celltypes(J)=[];
0173 end
0174 if isfield(hpaData,'gene2Level')
0175     hpaData.gene2Level(:,J)=[];
0176 end
0177 if isfield(hpaData,'gene2Type')
0178     hpaData.gene2Type(:,J)=[];
0179 end
0180 if isfield(hpaData,'gene2Reliability')
0181     hpaData.gene2Reliability(:,J)=[];
0182 end
0183 
0184 % Remove all genes from the structures that are not in model or that aren't
0185 % measured in the tissue
0186 if ~isempty(hpaData.genes) % This should not be necessary, but the summation is a 0x1 matrix and the other is []
0187     I = ~ismember(hpaData.genes,model.genes) | sum(hpaData.gene2Level,2) == 0;
0188 else
0189     I = [];
0190 end
0191 hpaData.genes(I) = [];
0192 if isfield(hpaData,'gene2Level')
0193     hpaData.gene2Level(I,:) = [];
0194 end
0195 if isfield(hpaData,'gene2Type')
0196     hpaData.gene2Type(I,:) = [];
0197 end
0198 if isfield(hpaData,'gene2Reliability')
0199     hpaData.gene2Reliability(I,:) = [];
0200 end
0201 
0202 I = strcmpi(arrayData.tissues,tissue);
0203 % If cell type is supplied, then only keep that cell type
0204 if any(celltype)
0205     I = I & strcmpi(arrayData.celltypes,celltype);
0206 end
0207 
0208 % Remove all genes from the structures that are not in model or that aren't
0209 % measured in the tissue
0210 J = ~ismember(arrayData.genes,model.genes) | all(isnan(arrayData.levels(:,I)),2);
0211 arrayData.genes(J) = [];
0212 arrayData.levels(J,:) = [];
0213 if isfield(arrayData,'threshold')
0214     arrayData.threshold(J) = [];
0215 end
0216 
0217 % Calculate the scores for the arrayData. These scores are calculated for
0218 % each genes from its fold change between the tissue/celltype(s) in question
0219 % and all other celltypes, or the threshold if supplied. This is a lower
0220 % quality data than protein abundance, since gene abundance is an indirect
0221 % estimate of protein level. These scores are therefore only used for genes
0222 % for which there is no HPA data available. The fold changes are transformed
0223 % as min(5*log(x),10) for x > 1 and max(5*log(x),-5) for x < 1 in order to
0224 % have negative scores for lower expressed genes and to scale the scores
0225 % to have somewhat lower weights than the HPA scores
0226 tempArrayLevels = arrayData.levels;
0227 tempArrayLevels(isnan(tempArrayLevels)) = 0;
0228 if isfield(arrayData,'threshold') && ~isempty(arrayData.threshold)
0229     % if provided, the user-supplied expression threshold value(s) will be
0230     % used as the "average" expression level to which each gene is compared
0231     average = arrayData.threshold;
0232 else
0233     average = sum(tempArrayLevels,2)./sum(~isnan(arrayData.levels),2);
0234 end
0235 if strcmpi(multipleCellScoring,'max')
0236     current = max(tempArrayLevels(:,I),[],2);
0237 else
0238     current = sum(tempArrayLevels(:,I),2)./sum(~isnan(arrayData.levels(:,I)),2);
0239 end
0240 if ~isempty(current)
0241     aScores = 5*log(current./average);
0242 else
0243     aScores = [];
0244 end
0245 aScores(aScores > 0) = min(aScores(aScores > 0),10);
0246 aScores(aScores < 0) = max(aScores(aScores < 0),-5);
0247 aScores(isnan(aScores)) = -5;  % NaNs occur when gene expression is zero across all tissues
0248 
0249 % Map the HPA levels to scores
0250 [I, J] = ismember(upper(hpaData.levels),upper(hpaLevelScores.names));
0251 if ~all(I)
0252     EM = 'There are expression level categories that do not match to hpaLevelScores';
0253     dispEM(EM);
0254 end
0255 [K, L, M] = find(hpaData.gene2Level);
0256 scores = hpaLevelScores.scores(J);
0257 if strcmpi(multipleCellScoring,'max')
0258     hScores = max(sparse(K,L,scores(M),numel(hpaData.genes),numel(hpaData.tissues)),[],2);
0259 else
0260     hScores = mean(sparse(K,L,scores(M),numel(hpaData.genes),numel(hpaData.tissues)),2);
0261 end
0262 
0263 % Assign gene scores, prioritizing HPA (protein) data over arrayData (RNA)
0264 geneScores = nan(numel(model.genes),1);
0265 hpaScores = -Inf(numel(model.genes),1);
0266 arrayScores = -Inf(numel(model.genes),1);
0267 
0268 [I, J] = ismember(model.genes,arrayData.genes);
0269 geneScores(I) = aScores(J(I));
0270 arrayScores(I) = aScores(J(I));
0271 
0272 [I, J] = ismember(model.genes,hpaData.genes);
0273 geneScores(I) = hScores(J(I));
0274 hpaScores(I) = hScores(J(I));
0275 
0276 
0277 % To speed things up, only need to score each unique grRule once
0278 [uRules,~,rule_ind] = unique(model.grRules);
0279 uScores = nan(size(uRules));
0280 
0281 % convert logic operators to symbols
0282 uRules = regexprep(uRules,' and ',' & ');
0283 uRules = regexprep(uRules,' or ',' | ');
0284 
0285 % score based on presence/combination of & and | operators
0286 for i = 1:numel(uRules)
0287     if isempty(uRules{i})
0288         uScores(i) = noGeneScore;
0289     elseif contains(uRules{i},'&') && contains(uRules{i},'|')
0290         uScores(i) = scoreComplexRule(uRules{i},model.genes,geneScores,isozymeScoring,complexScoring);
0291     else
0292         uScores(i) = scoreSimpleRule(uRules{i},model.genes,geneScores,isozymeScoring,complexScoring);
0293     end
0294 end
0295 
0296 % NaN reaction scores should be changed to the no-gene score
0297 uScores(isnan(uScores)) = noGeneScore;
0298 
0299 % re-map unique rule scores to model
0300 rxnScores = uScores(rule_ind);
0301 
0302 end
0303 
0304 
0305 
0306 function rScore = scoreSimpleRule(rule,genes,gScores,isozymeScoring,complexScoring)
0307 %Score reactions with simple gene rules (those with all ANDs or all ORs).
0308 
0309 if ~contains(rule,'&')
0310     scoreMethod = isozymeScoring;
0311 elseif ~contains(rule,'|')
0312     scoreMethod = complexScoring;
0313 else
0314     error('This function cannot handle complex gene rules.');
0315 end
0316 ruleGenes = unique(regexp(rule,'[^&|\(\) ]+','match'));
0317 [~,geneInd] = ismember(ruleGenes,genes);
0318 switch lower(scoreMethod)
0319     case 'min'
0320         rScore = min(gScores(geneInd),[],'omitnan');
0321     case 'max'
0322         rScore = max(gScores(geneInd),[],'omitnan');
0323     case 'median'
0324         rScore = median(gScores(geneInd),'omitnan');
0325     case 'average'
0326         rScore = mean(gScores(geneInd),'omitnan');
0327 end
0328 
0329 end
0330 
0331 
0332 
0333 function rScore = scoreComplexRule(rule,genes,gScores,isozymeScoring,complexScoring)
0334 %Score reactions with complex gene rules (those with both ANDs or ORs).
0335 
0336 % Specify phrases to search for in the grRule. These phrases will find
0337 % genes grouped by all ANDs (first phrase) or all ORs (second phrase).
0338 search_phrases = {'\([^&|\(\) ]+( & [^&|\(\) ]+)+\)', '\([^&|\(\) ]+( \| [^&|\(\) ]+)+\)'};
0339 
0340 % initialize some variables
0341 subsets = {};  % subsets are groups of genes grouped by all ANDs or all ORs
0342 c = 1;  % counter to keep track of the group (subset) number
0343 r_orig = rule;  % record original rule to determine when it stops changing
0344 for k = 1:100  % iterate some arbitrarily high number of times
0345     for j = 1:length(search_phrases)
0346         new_subset = regexp(rule,search_phrases{j},'match')';  % extract subsets
0347         if ~isempty(new_subset)
0348             subsets = [subsets; new_subset];  % append to list of subsets
0349             subset_nums = arrayfun(@num2str,(c:length(subsets))','UniformOutput',false);  % get group numbers to be assigned to the new subsets, and convert to strings
0350             rule = regexprep(rule,search_phrases{j},strcat('#',subset_nums,'#'),'once');  % replace the subsets in the expression with their group numbers (enclosed by "#"s)
0351             c = c + length(new_subset);
0352         end
0353     end
0354     if isequal(rule,r_orig)
0355         break;  % stop iterating when rule stops changing
0356     else
0357         r_orig = rule;
0358     end
0359 end
0360 subsets{end+1} = rule;  % add final state of rule as the last subset
0361 
0362 % score each subset and append to gene list and gene scores
0363 for i = 1:numel(subsets)
0364     gScores = [gScores; scoreSimpleRule(subsets{i},genes,gScores,isozymeScoring,complexScoring)];
0365     genes = [genes; {strcat('#',num2str(i),'#')}];
0366 end
0367 
0368 % the final subset score is the overall reaction score
0369 rScore = gScores(end);
0370 
0371 end
0372
scoreComplexModel

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE