function [elements, useMat, exitFlag, MW]=parseFormulas(formulas, noPolymers,isInchi,ignoreRX)
% parseFormulas
%   Extracts the elemental composition from a list of chemical formulas.
%
%   formulas     cell array of formula strings. A single non-empty char
%                vector is also accepted and treated as one formula.
%   noPolymers   controls formulas containing exactly one '(' and one ')n'.
%                If true, the '(' and ')n' markers are stripped and the
%                remainder is parsed as an ordinary formula. If false, such
%                formulas are not parsed and get exitFlag -1
%                (opt, default false)
%   isInchi      true if the formulas are '/'-separated InChI-style strings;
%                the second '/'-separated field is then used as the formula
%                and no polymer handling is done (opt, default false)
%   ignoreRX     true if the 'R' and 'X' pseudo-elements should contribute
%                zero weight to MW instead of making it NaN
%                (opt, default false)
%
%   elements     structure describing the columns of useMat
%       abbrevs  cell array with element abbreviations
%       names    cell array with element names
%   useMat       matrix with one row per formula and one column per element
%                that occurs in at least one successfully parsed formula;
%                entries are the element coefficients
%   exitFlag     column vector with one flag per formula
%                 1   the formula was parsed
%                 0   the formula was empty
%                -1   the formula could not be parsed
%   MW           column vector with the sum of coefficient times element
%                weight for each formula. NaN if the formula was not parsed
%                or if it contains an element without a weight in the table
%
%   Notes on parsing: all '+' and '-' characters are removed before parsing
%   (after 'p+1' has been rewritten as 'H+'). An element is an upper case
%   letter optionally followed by one non-digit, non-upper-case character.
%   A missing coefficient means 1.
%
%   Usage: [elements, useMat, exitFlag, MW]=parseFormulas(formulas,...
%           noPolymers,isInchi,ignoreRX)

if nargin<2
    noPolymers=false;
end
if nargin<3
    isInchi=false;
end
if nargin<4
    ignoreRX=false;
end

%Allow a single formula to be supplied as a plain char vector. Empty char
%input is left untouched so that it still yields empty outputs.
if ischar(formulas) && ~isempty(formulas)
    formulas={formulas};
end

elements.abbrevs={'C', 'N', 'O', 'S', 'P', 'H', 'He', 'Li', 'Be', 'B', 'F', 'Ne', 'Na', 'Mg', 'Al',...
    'Si', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni',...
    'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc',...
    'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce',...
    'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',...
    'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra',...
    'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',...
    'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'R', 'X'}';
elements.names={'carbon', 'nitrogen', 'oxygen', 'sulfur', 'phosphorus', 'hydrogen', 'helium', 'lithium', 'beryllium', 'boron',...
    'fluorine', 'neon', 'sodium', 'magnesium', 'aluminum', 'silicon',...
    'chlorine', 'argon', 'potassium', 'calcium', 'scandium', 'titanium', 'vanadium',...
    'chromium', 'manganese', 'iron', 'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium',...
    'arsenic', 'selenium', 'bromine', 'krypton', 'rubidium', 'strontium', 'yttrium', 'zirconium',...
    'niobium', 'molybdenum', 'technetium', 'ruthenium', 'rhodium', 'palladium', 'silver', 'cadmium',...
    'indium', 'tin', 'antimony', 'tellurium', 'iodine', 'xenon', 'cesium', 'barium', 'lanthanum',...
    'cerium', 'praseodymium', 'neodymium', 'promethium', 'samarium', 'europium', 'gadolinium',...
    'terbium', 'dysprosium', 'holmium', 'erbium', 'thulium', 'ytterbium', 'lutetium', 'hafnium',...
    'tantalum', 'tungsten', 'rhenium', 'osmium', 'iridium', 'platinum', 'gold', 'mercury',...
    'thallium', 'lead', 'bismuth', 'polonium', 'astatine', 'radon', 'francium', 'radium',...
    'actinium', 'thorium', 'protactinium', 'uranium', 'neptunium', 'plutonium', 'americium',...
    'curium', 'berkelium', 'californium', 'einsteinium', 'fermium', 'mendelevium', 'nobelium',...
    'lawrencium', 'rutherfordium', 'dubnium', 'seaborgium', 'bohrium', 'hassium', 'meitnerium',...
    'darmstadtium', 'roentgenium', 'copernicium', 'generic group', 'bound protein'}';

%Element weights, in the same order as elements.abbrevs. The last three
%entries (Cn, R and X) have no weight and are NaN.
EWs=[12.0107 14.0067 15.9994 32.065 30.973762 1.00794 4.002602 6.941 9.012182 10.811 18.9984032 ...
    20.1797 22.98976928 24.305 26.9815386 28.0855 35.453 39.948 39.0983 40.078 44.955912 47.867 50.9415 ...
    51.9961 54.938045 55.845 58.933195 58.6934 63.546 65.39 69.723 72.64 74.9216 78.96 79.904 83.798 ...
    85.4678 87.62 88.90585 91.224 92.906 95.94 97.9072 101.07 102.905 106.42 107.8682 112.411 114.818 ...
    118.71 121.76 127.6 126.904 131.293 132.9054519 137.327 138.90547 140.116 140.90765 144.242 144.9127 ...
    150.36 151.964 157.25 158.92535 162.5 164.93 167.259 168.93421 173.04 174.967 178.49 180.94788 183.84 ...
    186.207 190.23 192.217 195.084 196.966569 200.59 204.3833 207.2 208.9804 208.9824 209.9871 222.0176 ...
    223.0197 226.0254 227.0277 232.03806 231.03588 238.02891 237.0482 244.0642 243.0614 247.0704 247.0703 ...
    251.0796 252.083 257.0951 258.0984 259.101 262.1097 261.1088 262 266 264 277 268 271 272 nan nan nan]';

%With ignoreRX the R and X groups count as weightless rather than unknown
if ignoreRX==true
    EWs(end-1:end)=0;
end

useMat=zeros(numel(formulas),numel(elements.abbrevs));

%Stays 0 for empty formulas
exitFlag=zeros(numel(formulas),1);

%Rewrite the 'p+1' token as 'H+' so that it is counted as hydrogen
formulas=strrep(formulas,'p+1','H+');

%Remove all '+' and '-' characters
formulas=strrep(formulas,'+','');
formulas=strrep(formulas,'-','');

for i=1:numel(formulas)
    if ~isempty(formulas{i})
        success=false;
        formula=formulas{i};

        %For InChI-style strings the formula is the second '/'-separated
        %field. If there is no such field the formula becomes empty, which
        %is reported as a parsing failure below.
        if isInchi==true
            S=regexp(formula,'/','split');
            if numel(S)>=2
                formula=S{2};
            else
                formula='';
            end
        end

        %Polymer handling. Only the form with exactly one '(' and exactly
        %one ')n' is supported; any other use of these tokens is a failure.
        if isInchi==false
            LP=strfind(formula,'(');
            RP=strfind(formula,')n');

            if numel(LP)==1 && numel(RP)==1
                if noPolymers==true
                    %Strip the polymer markers and parse what is left
                    formula=strrep(formula,'(','');
                    formula=strrep(formula,')n','');
                else
                    exitFlag(i)=-1;
                    continue;
                end
            else
                if ~isempty(LP) || ~isempty(RP)
                    exitFlag(i)=-1;
                    continue;
                end
            end
        end

        %True for every character that is not a digit or a decimal point
        nonNumeric=false(numel(formula),1);
        nonNumeric(regexp(formula,'[^0-9.]'))=true;

        %Every upper case letter marks the start of an element token
        upperI=isstrprop(formula,'upper');
        upperX=find(upperI);

        for j=1:numel(upperX)
            pos=upperX(j);

            if pos==numel(formula)
                %An upper case letter at the very end of the formula
                coeff=1;
                element=formula(pos);
            elseif nonNumeric(pos+1)
                if upperI(pos+1)
                    %Directly followed by the next element
                    coeff=1;
                    element=formula(pos);
                else
                    %Two-character element abbreviation
                    if j==numel(upperX)
                        if pos<numel(formula)-1
                            coeff=str2double(formula(pos+2:end));
                        else
                            coeff=1;
                        end
                    else
                        %pos+2 is a valid index here since there is at
                        %least one more upper case letter after pos+1
                        if nonNumeric(pos+2)==true
                            coeff=1;
                        else
                            coeff=str2double(formula(pos+2:upperX(j+1)-1));
                        end
                    end
                    element=formula(pos:pos+1);
                end
            else
                %One-character element followed by its coefficient
                if j==numel(upperX)
                    coeff=str2double(formula(pos+1:end));
                else
                    coeff=str2double(formula(pos+1:upperX(j+1)-1));
                end
                element=formula(pos);
            end

            %Add the coefficient to the matching element column. An unknown
            %element or an unreadable coefficient invalidates the whole
            %formula, also when earlier tokens were parsed correctly.
            I=strcmp(element,elements.abbrevs);
            if any(I) && ~isnan(coeff)
                useMat(i,I)=useMat(i,I)+coeff;
                success=true;
            else
                success=false;
                break;
            end
        end

        if success==false
            %Discard anything that was stored before the failure
            useMat(i,:)=0;
            exitFlag(i)=-1;
        else
            exitFlag(i)=1;
        end
    end
end

%Remove elements that are not used in any formula. The dimension must be
%given explicitly; any(useMat) would collapse a single-row matrix to a
%scalar.
I=~any(useMat,1);
useMat(:,I)=[];
elements.abbrevs(I)=[];
elements.names(I)=[];
EWs(I)=[];

%Calculate the molecular weights if requested
if nargout>3
    hasWeight=~isnan(EWs);
    P=bsxfun(@times,useMat(:,hasWeight),EWs(hasWeight).');
    MW=sum(P,2);

    %Formulas that use an element without a weight get NaN. A row mask is
    %used since linear indices into the sub-matrix can exceed the number
    %of formulas.
    MW(any(useMat(:,~hasWeight),2))=nan;
    MW(exitFlag~=1)=nan;
end
end