Home > src > geckomat > get_enzyme_data > loadDatabases.m

loadDatabases

PURPOSE ^

loadDatabases

SYNOPSIS ^

function databases = loadDatabases(selectDatabase,modelAdapter)

DESCRIPTION ^

 loadDatabases
   Loads (and downloads if necessary) the organism-specific KEGG and
   UniProt databases that are required to extract protein information. The
   uniprot.ID and kegg.ID are taken from the ModelAdapter.

 Input:
   selectDatabase  which databases should be loaded, either 'uniprot',
                   'kegg' or 'both' (optional, default 'both')
   modelAdapter    Model adapter. Optional, default will use the default 
                   model adapter (send in [] for default).

 Output:
   databases       contains .uniprot and .kegg structures, dependent on
                   which databases were selected.

 Usage:
   databases = loadDatabases(selectDatabase,modelAdapter)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function databases = loadDatabases(selectDatabase,modelAdapter)
0002 % loadDatabases
0003 %   Loads (and downloads if necessary) the organism-specific KEGG and
0004 %   UniProt databases that are required to extract protein information. The
0005 %   uniprot.ID and kegg.ID are taken from the ModelAdapter.
0006 %
0007 % Input:
0008 %   selectDatabase  which databases should be loaded, either 'uniprot',
0009 %                   'kegg' or 'both' (optional, default 'both')
0010 %   modelAdapter    Model adapter. Optional, default will use the default
0011 %                   model adapter (send in [] for default).
0012 %
0013 % Output:
0014 %   databases       contains .uniprot and .kegg structures, dependent on
0015 %                   which databases were selected.
0016 %
0017 % Usage:
0018 %   databases = loadDatabases(selectDatabase,modelAdapter)
0019 
0020 if nargin<1
0021     selectDatabase = 'both';
0022 end
0023 
0024 if nargin < 2 || isempty(modelAdapter)
0025     modelAdapter = ModelAdapterManager.getDefault();
0026     if isempty(modelAdapter)
0027         error('Either send in a modelAdapter or set the default model adapter in the ModelAdapterManager.')
0028     end
0029 end
0030 
0031 params      = modelAdapter.getParameters();
0032 kegg.ID      = params.kegg.ID;
0033 uniprot.ID   = params.uniprot.ID;
0034 filePath    = fullfile(params.path,'data');
0035 uniprot.geneIDfield = params.uniprot.geneIDfield;
0036 uniprot.type = params.uniprot.type;
0037 if strcmp(uniprot.type,'taxonomy')
0038     uniprot.type = 'taxonomy_id';
0039 end
0040 kegg.geneID = params.kegg.geneID;
0041 if params.uniprot.reviewed
0042     uniprotRev = 'reviewed:true+AND+';
0043 else
0044     uniprotRev = '';
0045 end
0046 
0047 warning('off', 'MATLAB:MKDIR:DirectoryExists');
0048 
0049 databases.uniprot = [];
0050 databases.kegg = [];
0051 
0052 %% Uniprot
0053 if any(strcmp(selectDatabase,{'uniprot','both'}))
0054     uniprotPath = fullfile(filePath,'uniprot.tsv');
0055     if ~exist(uniprotPath,'file')
0056         if isempty(uniprot.ID)
0057             printOrange('WARNING: No uniprot.ID is specified, unable to download UniProt DB.\n');
0058         end
0059         disp(['Downloading Uniprot data for ' uniprot.type ' ' uniprot.ID '. This can take a few minutes.'])
0060         url = ['https://rest.uniprot.org/uniprotkb/stream?query=' uniprotRev ...
0061                uniprot.type ':' num2str(uniprot.ID) '&fields=accession%2C' uniprot.geneIDfield ...
0062             '%2Cec%2Cmass%2Csequence&format=tsv&compressed=false&sort=protein_name%20asc'];
0063         try
0064             urlwrite(url,uniprotPath,'Timeout',30);
0065             fprintf('Model-specific UniProt database stored at %s\n',uniprotPath);
0066         catch
0067             error(['Download failed, check your internet connection and try again, or manually download: ' url ...
0068                 ' After downloading, store the file as ' uniprotPath])
0069         end
0070     end
0071     if exist(uniprotPath,'file')
0072         fid         = fopen(uniprotPath,'r');
0073         fileContent = textscan(fid,'%q %q %q %q %q','Delimiter','\t','HeaderLines',1);
0074         fclose(fid);
0075         databases.uniprot.ID      = fileContent{1};
0076         databases.uniprot.genes   = fileContent{2};
0077         databases.uniprot.eccodes = fileContent{3};
0078         databases.uniprot.MW      = str2double(fileContent{4});
0079         databases.uniprot.seq     = fileContent{5};
0080     else
0081         databases.uniprot = [];
0082     end
0083     if ~isempty(databases.uniprot)
0084         [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,'stable');
0085         if numel(uniqueIDs) < numel(databases.uniprot.ID)
0086             duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx);
0087             dispEM(['Duplicate entries are found for the following proteins. '...
0088                     'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '...
0089                     'in the model adapter:'],true,databases.uniprot.ID(duplID));
0090         end
0091     end
0092 end
0093 
0094 %% KEGG
0095 if any(strcmp(selectDatabase,{'kegg','both'}))
0096     keggPath = fullfile(filePath,'kegg.tsv');
0097     if ~exist(keggPath,'file')
0098         if isempty(kegg.ID)
0099             printOrange('WARNING: No kegg.ID is specified, unable to download KEGG DB.\n');
0100         else
0101             downloadKEGG(kegg.ID,keggPath,kegg.geneID);
0102         end
0103     end
0104     if exist(keggPath,'file')
0105         fid         = fopen(keggPath,'r');
0106         fileContent = textscan(fid,'%q %q %q %q %q %q %q','Delimiter',',','HeaderLines',0);
0107         fclose(fid);
0108         databases.kegg.uniprot    = fileContent{1};
0109         databases.kegg.genes      = fileContent{2};
0110         databases.kegg.keggGene   = fileContent{3};
0111         databases.kegg.eccodes    = fileContent{4};
0112         databases.kegg.MW         = str2double(fileContent{5});
0113         databases.kegg.pathway    = fileContent{6};
0114         databases.kegg.seq        = fileContent{7};
0115     else
0116         databases.kegg = [];
0117     end
0118 end
0119 end
0120 
0121 function downloadKEGG(keggID, filePath, keggGeneID)
0122 %% Download gene information
0123 progressbar(['Downloading KEGG data for organism code ' keggID])
0124 webOptions = weboptions('Timeout',30);
0125 try
0126     gene_list = webread(['http://rest.kegg.jp/list/' keggID],webOptions);
0127 catch ME
0128     switch ME.identifier
0129         case 'MATLAB:webservices:HTTP400StatusCodeError'
0130             error(['Unable to download data form KEGG with a potentially invalid ID: ' keggID ])
0131     end
0132 end
0133 gene_list = regexpi(gene_list, '[^\n]+','match')';
0134 gene_id   = regexpi(gene_list,['(?<=' keggID ':)\S+'],'match');
0135 
0136 % Retrieve information for every gene in the list, 10 genes per query
0137 genesPerQuery = 10;
0138 queries = ceil(numel(gene_id)/genesPerQuery);
0139 keggData  = cell(numel(gene_id),1);
0140 for i = 1:queries
0141     % Download batches of genes
0142     firstIdx = i*genesPerQuery-(genesPerQuery-1);
0143     lastIdx  = i*genesPerQuery;
0144     if lastIdx > numel(gene_id) % Last query has probably less genes
0145         lastIdx = numel(gene_id);
0146     end
0147     url      = ['http://rest.kegg.jp/get/' keggID ':' strjoin([gene_id{firstIdx:lastIdx}],['+' keggID ':'])];
0148 
0149     retry = true;
0150     while retry
0151         try
0152             retry = false;
0153             out   = webread(url,webOptions);
0154         catch
0155             retry = true;
0156         end
0157     end
0158     outSplit = strsplit(out,['///' 10]); %10 is new line character
0159     if numel(outSplit) < lastIdx-firstIdx+2
0160         error('KEGG returns less genes per query') %Reduce genesPerQuery
0161     end
0162     keggData(firstIdx:lastIdx) = outSplit(1:end-1);
0163     progressbar(i/queries)
0164 end
0165 
0166 %% Parsing of info to keggDB format
0167 sequence  = regexprep(keggData,'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*','$1');
0168 %No AASEQ -> no protein -> not of interest
0169 noProt    = startsWith(sequence,'ENTRY ');
0170 uni       = regexprep(keggData,'.*UniProt: (\S+?)\s.*','$1');
0171 noUni     = startsWith(uni,'ENTRY ');
0172 uni(noProt | noUni)       = [];
0173 keggData(noProt | noUni) = [];
0174 sequence(noProt | noUni)  = [];
0175 sequence  = regexprep(sequence,'\s*','');
0176 keggGene  = regexprep(keggData,'ENTRY\s+(\S+?)\s.+','$1');
0177 
0178 switch keggGeneID
0179     case {'kegg',''}
0180         gene_name = keggGene;
0181     otherwise
0182         % In case there are special characters:
0183         keggGeneIDT = regexptranslate('escape',keggGeneID);
0184         gene_name = regexprep(keggData,['.+' keggGeneIDT ': (\S+?)\n.+'],'$1');
0185         noID = ~contains(keggData,keggGeneIDT);
0186         if all(noID)
0187             error(['None of the KEGG entries are annotated with the gene identifier ' keggGeneID])
0188         else
0189             gene_name(noID)= [];
0190             keggData(noID) = [];
0191             keggGene(noID) = [];
0192             sequence(noID) = [];
0193             uni(noID)      = [];
0194         end
0195 end
0196 
0197 EC_names  = regexprep(keggData,'.*ORTHOLOGY.*\[EC:(.*?)\].*','$1');
0198 EC_names(startsWith(EC_names,'ENTRY ')) = {''};
0199 
0200 MW = cell(numel(sequence),1);
0201 for i=1:numel(sequence)
0202     if ~isempty(sequence{i})
0203         MW{i} = num2str(round(calculateMW(sequence{i})));
0204     end
0205 end
0206 
0207 pathway   = regexprep(keggData,'.*PATHWAY\s+(.*?)(BRITE|MODULE).*','$1');
0208 pathway(startsWith(pathway,'ENTRY ')) = {''};
0209 pathway   = strrep(pathway,[keggID '01100  Metabolic pathways'],'');
0210 pathway   = regexprep(pathway,'\n','');
0211 pathway   = regexprep(pathway,'           ','');
0212 
0213 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence];
0214 out = cell2table(out);
0215 
0216 writetable(out, filePath, 'FileType', 'text', 'WriteVariableNames',false);
0217 fprintf('Model-specific KEGG database stored at %s\n',filePath);
0218 end

Generated by m2html © 2005