0001 function databases = loadDatabases(selectDatabase,modelAdapter)
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020 if nargin<1
0021 selectDatabase = 'both';
0022 end
0023
0024 if nargin < 2 || isempty(modelAdapter)
0025 modelAdapter = ModelAdapterManager.getDefault();
0026 if isempty(modelAdapter)
0027 error('Either send in a modelAdapter or set the default model adapter in the ModelAdapterManager.')
0028 end
0029 end
0030
0031 params = modelAdapter.getParameters();
0032 kegg.ID = params.kegg.ID;
0033 uniprot.ID = params.uniprot.ID;
0034 filePath = fullfile(params.path,'data');
0035 uniprot.geneIDfield = params.uniprot.geneIDfield;
0036 uniprot.type = params.uniprot.type;
0037 if strcmp(uniprot.type,'taxonomy')
0038 uniprot.type = 'taxonomy_id';
0039 end
0040 kegg.geneID = params.kegg.geneID;
0041 if params.uniprot.reviewed
0042 uniprotRev = 'reviewed:true+AND+';
0043 else
0044 uniprotRev = '';
0045 end
0046
0047 warning('off', 'MATLAB:MKDIR:DirectoryExists');
0048
0049 databases.uniprot = [];
0050 databases.kegg = [];
0051
0052
0053 if any(strcmp(selectDatabase,{'uniprot','both'}))
0054 uniprotPath = fullfile(filePath,'uniprot.tsv');
0055 if ~exist(uniprotPath,'file')
0056 if isempty(uniprot.ID)
0057 printOrange('WARNING: No uniprot.ID is specified, unable to download UniProt DB.\n');
0058 end
0059 disp(['Downloading Uniprot data for ' uniprot.type ' ' uniprot.ID '. This can take a few minutes.'])
0060 url = ['https://rest.uniprot.org/uniprotkb/stream?query=' uniprotRev ...
0061 uniprot.type ':' num2str(uniprot.ID) '&fields=accession%2C' uniprot.geneIDfield ...
0062 '%2Cec%2Cmass%2Csequence&format=tsv&compressed=false&sort=protein_name%20asc'];
0063 try
0064 urlwrite(url,uniprotPath,'Timeout',30);
0065 fprintf('Model-specific UniProt database stored at %s\n',uniprotPath);
0066 catch
0067 error(['Download failed, check your internet connection and try again, or manually download: ' url ...
0068 ' After downloading, store the file as ' uniprotPath])
0069 end
0070 end
0071 if exist(uniprotPath,'file')
0072 fid = fopen(uniprotPath,'r');
0073 fileContent = textscan(fid,'%q %q %q %q %q','Delimiter','\t','HeaderLines',1);
0074 fclose(fid);
0075 databases.uniprot.ID = fileContent{1};
0076 databases.uniprot.genes = fileContent{2};
0077 databases.uniprot.eccodes = fileContent{3};
0078 databases.uniprot.MW = str2double(fileContent{4});
0079 databases.uniprot.seq = fileContent{5};
0080 else
0081 databases.uniprot = [];
0082 end
0083 if ~isempty(databases.uniprot)
0084 [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,'stable');
0085 if numel(uniqueIDs) < numel(databases.uniprot.ID)
0086 duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx);
0087 dispEM(['Duplicate entries are found for the following proteins. '...
0088 'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '...
0089 'in the model adapter:'],true,databases.uniprot.ID(duplID));
0090 end
0091 end
0092 end
0093
0094
0095 if any(strcmp(selectDatabase,{'kegg','both'}))
0096 keggPath = fullfile(filePath,'kegg.tsv');
0097 if ~exist(keggPath,'file')
0098 if isempty(kegg.ID)
0099 printOrange('WARNING: No kegg.ID is specified, unable to download KEGG DB.\n');
0100 else
0101 downloadKEGG(kegg.ID,keggPath,kegg.geneID);
0102 end
0103 end
0104 if exist(keggPath,'file')
0105 fid = fopen(keggPath,'r');
0106 fileContent = textscan(fid,'%q %q %q %q %q %q %q','Delimiter',',','HeaderLines',0);
0107 fclose(fid);
0108 databases.kegg.uniprot = fileContent{1};
0109 databases.kegg.genes = fileContent{2};
0110 databases.kegg.keggGene = fileContent{3};
0111 databases.kegg.eccodes = fileContent{4};
0112 databases.kegg.MW = str2double(fileContent{5});
0113 databases.kegg.pathway = fileContent{6};
0114 databases.kegg.seq = fileContent{7};
0115 else
0116 databases.kegg = [];
0117 end
0118 end
0119 end
0120
0121 function downloadKEGG(keggID, filePath, keggGeneID)
0122
0123 progressbar(['Downloading KEGG data for organism code ' keggID])
0124 webOptions = weboptions('Timeout',30);
0125 try
0126 gene_list = webread(['http://rest.kegg.jp/list/' keggID],webOptions);
0127 catch ME
0128 switch ME.identifier
0129 case 'MATLAB:webservices:HTTP400StatusCodeError'
0130 error(['Unable to download data form KEGG with a potentially invalid ID: ' keggID ])
0131 end
0132 end
0133 gene_list = regexpi(gene_list, '[^\n]+','match')';
0134 gene_id = regexpi(gene_list,['(?<=' keggID ':)\S+'],'match');
0135
0136
0137 genesPerQuery = 10;
0138 queries = ceil(numel(gene_id)/genesPerQuery);
0139 keggData = cell(numel(gene_id),1);
0140 for i = 1:queries
0141
0142 firstIdx = i*genesPerQuery-(genesPerQuery-1);
0143 lastIdx = i*genesPerQuery;
0144 if lastIdx > numel(gene_id)
0145 lastIdx = numel(gene_id);
0146 end
0147 url = ['http://rest.kegg.jp/get/' keggID ':' strjoin([gene_id{firstIdx:lastIdx}],['+' keggID ':'])];
0148
0149 retry = true;
0150 while retry
0151 try
0152 retry = false;
0153 out = webread(url,webOptions);
0154 catch
0155 retry = true;
0156 end
0157 end
0158 outSplit = strsplit(out,['///' 10]);
0159 if numel(outSplit) < lastIdx-firstIdx+2
0160 error('KEGG returns less genes per query')
0161 end
0162 keggData(firstIdx:lastIdx) = outSplit(1:end-1);
0163 progressbar(i/queries)
0164 end
0165
0166
0167 sequence = regexprep(keggData,'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*','$1');
0168
0169 noProt = startsWith(sequence,'ENTRY ');
0170 uni = regexprep(keggData,'.*UniProt: (\S+?)\s.*','$1');
0171 noUni = startsWith(uni,'ENTRY ');
0172 uni(noProt | noUni) = [];
0173 keggData(noProt | noUni) = [];
0174 sequence(noProt | noUni) = [];
0175 sequence = regexprep(sequence,'\s*','');
0176 keggGene = regexprep(keggData,'ENTRY\s+(\S+?)\s.+','$1');
0177
0178 switch keggGeneID
0179 case {'kegg',''}
0180 gene_name = keggGene;
0181 otherwise
0182
0183 keggGeneIDT = regexptranslate('escape',keggGeneID);
0184 gene_name = regexprep(keggData,['.+' keggGeneIDT ': (\S+?)\n.+'],'$1');
0185 noID = ~contains(keggData,keggGeneIDT);
0186 if all(noID)
0187 error(['None of the KEGG entries are annotated with the gene identifier ' keggGeneID])
0188 else
0189 gene_name(noID)= [];
0190 keggData(noID) = [];
0191 keggGene(noID) = [];
0192 sequence(noID) = [];
0193 uni(noID) = [];
0194 end
0195 end
0196
0197 EC_names = regexprep(keggData,'.*ORTHOLOGY.*\[EC:(.*?)\].*','$1');
0198 EC_names(startsWith(EC_names,'ENTRY ')) = {''};
0199
0200 MW = cell(numel(sequence),1);
0201 for i=1:numel(sequence)
0202 if ~isempty(sequence{i})
0203 MW{i} = num2str(round(calculateMW(sequence{i})));
0204 end
0205 end
0206
0207 pathway = regexprep(keggData,'.*PATHWAY\s+(.*?)(BRITE|MODULE).*','$1');
0208 pathway(startsWith(pathway,'ENTRY ')) = {''};
0209 pathway = strrep(pathway,[keggID '01100 Metabolic pathways'],'');
0210 pathway = regexprep(pathway,'\n','');
0211 pathway = regexprep(pathway,' ','');
0212
0213 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence];
0214 out = cell2table(out);
0215
0216 writetable(out, filePath, 'FileType', 'text', 'WriteVariableNames',false);
0217 fprintf('Model-specific KEGG database stored at %s\n',filePath);
0218 end