0001 function constructMultiFasta(model,sourceFile,outputDir)
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017 sourceFile=char(sourceFile);
0018 outputDir=char(outputDir);
0019 if ~isfile(sourceFile)
0020 error('FASTA file %s cannot be found',string(sourceFile));
0021 end
0022
0023 fprintf('Scanning the source multi-FASTA file... ');
0024
0025 fid = fopen(sourceFile, 'r');
0026
0027
0028 hTable = java.util.Hashtable;
0029
0030 for i=1:numel(model.genes)
0031 hTable.put(model.genes{i}, i);
0032 end
0033
0034
0035
0036 elementPositions=zeros(5000000,1,'int64');
0037 totalElements=0;
0038 whereAmI=0;
0039 while 1
0040
0041 str=fread(fid,10000000,'int8');
0042
0043
0044 newPosts=find(str==62);
0045
0046 elementPositions(totalElements+1:totalElements+numel(newPosts))=whereAmI+newPosts;
0047
0048 totalElements=totalElements+numel(newPosts);
0049
0050 whereAmI=whereAmI+10000000;
0051
0052 if feof(fid)
0053 break;
0054 end
0055 end
0056 elementPositions=elementPositions(1:totalElements);
0057 fprintf('COMPLETE\n');
0058
0059 fprintf(['NOTICE: If Matlab is freezing and does not provide any output in 30 minutes, consider increasing Java Heap Memory\n', ...
0060 'in MATLAB settings and start over with the new session\n']);
0061 fprintf('Mapping genes to the multi-FASTA source file... 0%% complete');
0062
0063
0064
0065 genePositions=zeros(numel(model.genes),1);
0066 for i=1:numel(elementPositions)
0067 fseek(fid,elementPositions(i),-1);
0068 str=fread(fid,[1 30],'*char');
0069 delim=find(str==32 | str==10,1,'first');
0070
0071 geneIdentifier=str(1:delim-1);
0072
0073
0074
0075
0076 if isempty(geneIdentifier)
0077 continue;
0078 end
0079
0080
0081 if isempty(delim)
0082 EM='Too long gene identifier, increase read length';
0083 dispEM(EM);
0084 end
0085
0086
0087 id=hTable.get(geneIdentifier);
0088
0089 if any(id)
0090 if genePositions(id)==0
0091 genePositions(id)=i;
0092 end
0093 end
0094
0095 if rem(i-1,350000) == 0
0096 progress=num2str(floor(100*i/numel(elementPositions)));
0097 progress=pad(progress,3,'left');
0098 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress);
0099 end
0100 end
0101 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n');
0102
0103 fprintf('Generating the KEGG Orthology specific multi-FASTA files... 0%% complete');
0104
0105 for i=1:numel(model.rxns)
0106
0107
0108 if ~exist(fullfile(outputDir,[model.rxns{i} '.fa']), 'file')
0109
0110
0111 genesUsed=model.rxnGeneMat(i,:);
0112
0113
0114
0115 rxnfid=fopen(fullfile(outputDir,[model.rxns{i} '.fa']),'w');
0116
0117 if any(genesUsed)
0118 positions=genePositions(genesUsed~=0);
0119
0120
0121
0122 positions(positions==0)=[];
0123
0124
0125 for j=1:numel(positions)
0126 fseek(fid,elementPositions(positions(j)),-1);
0127
0128 if positions(j)<numel(elementPositions)
0129 str=fread(fid,[1 double(elementPositions(positions(j)+1))-double(elementPositions(positions(j)))-1],'*char');
0130
0131
0132 if str(end)~=10
0133 str=[str fread(fid,[1 double(elementPositions(positions(j)+2))-double(elementPositions(positions(j)+1))],'*char')];
0134
0135
0136
0137
0138 if str(end)~=10
0139
0140 continue;
0141 end
0142 end
0143 else
0144 str=fread(fid,[1 inf],'*char');
0145 end
0146 fwrite(rxnfid,['>' str]);
0147 end
0148 end
0149 fclose(rxnfid);
0150 end
0151
0152 if rem(i-1,50) == 0
0153 progress=num2str(floor(100*i/numel(model.rxns)));
0154 progress=pad(progress,3,'left');
0155 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress);
0156 end
0157 end
0158 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n');
0159
0160
0161 fclose(fid);
0162 end