Home > core > parseFormulas.m

parseFormulas

PURPOSE ^

parseFormulas

SYNOPSIS ^

function [elements, useMat, exitFlag, MW]=parseFormulas(formulas, noPolymers,isInchi,ignoreRX)

DESCRIPTION ^

 parseFormulas
   Gets the elemental composition from formulas

   formulas      a cell array with formulas
   noPolymers    assume that all polymers consist of one element.
                 Corresponds to counting everything between (...)n as
                 n being equal to one. Only one set of parentheses
                 is allowed. If this is false then polymers are returned as
                 "Could not parse formula" (optional, default false)
   isInchi       true if the formulas are in the InChI format (optional,
                 default false)
   ignoreRX      ignore R-groups and bound protein. This can be useful since they
                 are often used only as intermediates (optional, default false)

   elements
       abbrevs   cell array with abbreviations for all used elements
       names     cell array with the names for all used elements
   useMat        MxN matrix with the number of atoms for each formula (M) and each
                 element (N)
   exitFlag      array with the exit flags:
                 1=  Sucessful parsing
                 0=  No formula found
                 -1= Could not parse formula
   MW            predicted molecular weight (g/mol). This is only returned
                 for formulas which can be sucessfully parsed, and its
                 calculation doesn't affect the exitFlag variable. NaN is
                 returned if the weight couldn't be calculated
   
 Usage: [elements, useMat, exitFlag, MW]=
               parseFormulas(formulas, noPolymers,isInchi,ignoreRX)

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [elements, useMat, exitFlag, MW]=parseFormulas(formulas, noPolymers,isInchi,ignoreRX)
0002 % parseFormulas
0003 %   Gets the elemental composition from formulas
0004 %
0005 %   formulas      a cell array with formulas
0006 %   noPolymers    assume that all polymers consist of one element.
0007 %                 Corresponds to counting everything between (...)n as
0008 %                 n being equal to one. Only one set of parentheses
0009 %                 is allowed. If this is false then polymers are returned as
0010 %                 "Could not parse formula" (optional, default false)
0011 %   isInchi       true if the formulas are in the InChI format (optional,
0012 %                 default false)
0013 %   ignoreRX      ignore R-groups and bound protein. This can be useful since they
0014 %                 are often used only as intermediates (optional, default false)
0015 %
0016 %   elements
0017 %       abbrevs   cell array with abbreviations for all used elements
0018 %       names     cell array with the names for all used elements
0019 %   useMat        MxN matrix with the number of atoms for each formula (M) and each
0020 %                 element (N)
0021 %   exitFlag      array with the exit flags:
0022 %                 1=  Sucessful parsing
0023 %                 0=  No formula found
0024 %                 -1= Could not parse formula
0025 %   MW            predicted molecular weight (g/mol). This is only returned
0026 %                 for formulas which can be sucessfully parsed, and its
0027 %                 calculation doesn't affect the exitFlag variable. NaN is
0028 %                 returned if the weight couldn't be calculated
0029 %
0030 % Usage: [elements, useMat, exitFlag, MW]=
0031 %               parseFormulas(formulas, noPolymers,isInchi,ignoreRX)
0032 
0033 if nargin<2
0034     noPolymers=false;
0035 end
0036 if nargin<3
0037     isInchi=false;
0038 end
0039 if nargin<4
0040     ignoreRX=false;
0041 end
0042 
0043 elements.abbrevs={'C', 'N', 'O', 'S', 'P', 'H', 'He', 'Li', 'Be', 'B', 'F', 'Ne', 'Na', 'Mg', 'Al',...
0044     'Si', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni',...
0045     'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc',...
0046     'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce',...
0047     'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',...
0048     'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra',...
0049     'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',...
0050     'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'R', 'X'}';
0051 elements.names={'carbon', 'nitrogen', 'oxygen', 'sulfur', 'phosphorus', 'hydrogen', 'helium', 'lithium', 'beryllium', 'boron',...
0052     'fluorine', 'neon', 'sodium', 'magnesium', 'aluminum,', 'silicon',...
0053     'chlorine', 'argon', 'potassium', 'calcium', 'scandium', 'titanium', 'vanadium',...
0054     'chromium', 'manganese', 'iron', 'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium',...
0055     'arsenic', 'selenium', 'bromine', 'krypton', 'rubidium', 'strontium', 'yttrium', 'zirconium',...
0056     'niobium', 'molybdenum', 'technetium', 'ruthenium', 'rhodium', 'palladium', 'silver', 'cadmium',...
0057     'indium', 'tin', 'antimony', 'tellurium', 'iodine', 'xenon', 'cesium', 'barium', 'lanthanum',...
0058     'cerium', 'praseodymium', 'neodymium', 'promethium', 'samarium', 'europium', 'gadolinium',...
0059     'terbium', 'dysprosium', 'holmium', 'erbium', 'thulium', 'ytterbium', 'lutetium', 'hafnium',...
0060     'tantalum', 'tungsten', 'rhenium', 'osmium', 'iridium', 'platinum', 'gold', 'mercury',...
0061     'thallium', 'lead', 'bismuth', 'polonium', 'astatine', 'radon', 'francium', 'radium',...
0062     'actinium', 'thorium', 'protactinium', 'uranium', 'neptunium', 'plutonium', 'americium',...
0063     'curium', 'berkelium', 'californium', 'einsteinium', 'fermium', 'mendelevium', 'nobelium',...
0064     'lawrencium', 'rutherfordium', 'dubnium', 'seaborgium', 'bohrium', 'hassium', 'meitnerium',...
0065     'darmstadtium', 'roentgenium', 'copernicium', 'generic group', 'bound protein'}';
0066 
0067 EWs=[12.0107 14.0067 15.9994 32.065 30.973762 1.00794 4.002602 6.941 9.012182 10.811 18.9984032 ...
0068     20.1797 22.98976928 24.305 26.9815386 28.0855 35.453 39.948 39.0983 40.078 44.955912 47.867 50.9415 ...
0069     51.9961 54.938045 55.845 58.933195 58.6934 63.546 65.39 69.723 72.64 74.9216 78.96 79.904 83.798 ...
0070     85.4678 87.62 88.90585 91.224 92.906 95.94 97.9072 101.07 102.905 106.42 107.8682 112.411 114.818 ...
0071     118.71 121.76 127.6 126.904 131.293 132.9054519 137.327 138.90547 140.116 140.90765 144.242 144.9127 ...
0072     150.36 151.964 157.25 158.92535 162.5 164.93 167.259 168.93421 173.04 174.967 178.49 180.94788 183.84 ...
0073     186.207 190.23 192.217 195.084 196.966569 200.59 204.3833 207.2 208.9804 208.9824 209.9871 222.0176 ...
0074     223.0197 226.0254 227.0277 232.03806 231.03588 238.02891 237.0482 244.0642 243.0614 247.0704 247.0703 ...
0075     251.0796 252.083 257.0951 258.0984 259.101 262.1097 261.1088 262 266 264 277 268 271 272 nan nan nan]';
0076 
0077 %Set the EWs of these groups to 0
0078 if ignoreRX==true
0079     EWs(end-1:end)=0;
0080 end
0081 
0082 useMat=zeros(numel(formulas),numel(elements.abbrevs));
0083 
0084 exitFlag=zeros(numel(formulas),1);
0085 
0086 %"H+", if parsed from InChI code, would have the composition "p+1". Change
0087 %to fit with how the rest of the compounds are written
0088 formulas=strrep(formulas,'p+1','H+');
0089 
0090 %Ignore charge to make the parsing easier
0091 formulas=strrep(formulas,'+','');
0092 formulas=strrep(formulas,'-','');
0093 
0094 %Loop through each formula
0095 for i=1:numel(formulas)
0096     if ~isempty(formulas{i})
0097         sucess=false; %To see if it works
0098         formula=formulas{i};
0099         
0100         %If it's an InChI code. The composition is found between the first
0101         %and the second "/" For some simple molecules such as salts only
0102         %the first "/" is present
0103         if isInchi==true
0104             S=regexp(formula,'/','split');
0105             if numel(S)>=2
0106                 formula=S{2};
0107             else
0108                 formula='';
0109             end
0110         end
0111         %Only look at what's between the parantheses (polymers are not
0112         %supported in InChI)
0113         if isInchi==false
0114             LP=strfind(formula,'(');
0115             RP=strfind(formula,')n');
0116             %            if numel(LP)>1 || numel(RP)>1
0117             %               exitFlag(i)=-1; continue;
0118             %            end if numel(LP)>1 || numel(RP)>1
0119             %               exitFlag(i)=-1; continue;
0120             %            end
0121             if numel(LP)==1 && numel(RP)==1
0122                 %This means that the polymer should be regarded as a
0123                 %monomer
0124                 if noPolymers==true
0125                     %This means that there are one set of parantheses
0126                     formula=strrep(formula,'(','');
0127                     formula=strrep(formula,')n','');
0128                 else
0129                     %This means that polymers should be ignored
0130                     exitFlag(i)=-1;
0131                     continue;
0132                 end
0133             else
0134                 if ~isempty(LP) || ~isempty(RP)
0135                     exitFlag(i)=-1;
0136                     continue;
0137                 end
0138             end
0139         end
0140         
0141         %Get the indexes of the numeric (or ".") characters
0142         nonNumeric=false(numel(formula),1);
0143         nonNumeric(regexp(formula,'[^0-9.]'))=true;
0144         
0145         %Get the indexes of the upper characters (since each element starts
0146         %with an upper character)
0147         upperI=isstrprop(formula,'upper');
0148         upperX=find(upperI);
0149         
0150         for j=1:numel(upperX)
0151             %The first case is when it's the last character. Then the
0152             %coefficient must be 1
0153             isLast=false;
0154             if upperX(j)==numel(formula)
0155                 coeff=1;
0156                 element=formula(upperX(j));
0157                 isLast=true;
0158             end
0159             
0160             if isLast==false
0161                 %The second case is when the following character is a
0162                 %character
0163                 if nonNumeric(upperX(j)+1)
0164                     %Is it a new element?
0165                     if upperI(upperX(j)+1)
0166                         %New element, that means that the coefficient was 1
0167                         %and that the element was only one character
0168                         coeff=1;
0169                         element=formula(upperX(j));
0170                     else
0171                         %This means that it's an element with two
0172                         %characters
0173                         if j==numel(upperX)
0174                             if upperX(j)<numel(formula)-1
0175                                 coeff=str2double(formula(upperX(j)+2:end));
0176                             else
0177                                 coeff=1;
0178                             end
0179                         else
0180                             %Check if there is a number or a new element
0181                             %after it
0182                             if nonNumeric(upperX(j)+2)==true
0183                                 coeff=1;
0184                             else
0185                                 coeff=str2double(formula(upperX(j)+2:upperX(j+1)-1));
0186                             end
0187                         end
0188                         element=formula(upperX(j):upperX(j)+1);
0189                     end
0190                 else
0191                     %Then it is a numeral
0192                     if j==numel(upperX)
0193                         coeff=str2double(formula(upperX(j)+1:end));
0194                     else
0195                         coeff=str2double(formula(upperX(j)+1:upperX(j+1)-1));
0196                     end
0197                     element=formula(upperX(j));
0198                 end
0199             end
0200             
0201             %Find the element
0202             I=strcmp(element,elements.abbrevs);
0203             if any(I)
0204                 if ~isnan(coeff)
0205                     useMat(i,I)=useMat(i,I)+coeff;
0206                     sucess=true;
0207                 else
0208                     break;
0209                 end
0210             else
0211                 break;
0212             end
0213         end
0214         if sucess==false
0215             useMat(i,:)=0; %Reset for this formula
0216             exitFlag(i)=-1;
0217         else
0218             exitFlag(i)=1;
0219         end
0220     end
0221 end
0222 
0223 %Remove the elements which are never used
0224 I=~any(useMat);
0225 useMat(:,I)=[];
0226 elements.abbrevs(I)=[];
0227 elements.names(I)=[];
0228 EWs(I)=[];
0229 
0230 %Calcluate the weight contribution of each element in each formula. Note
0231 %that this will give NaNs for all formulas if R or X groups are in EWs,
0232 %since 0*NaN is still NaN. Therefore only use elements with known mass
0233 if nargout>3
0234     P=bsxfun(@times,useMat(:,~isnan(EWs)),EWs(~isnan(EWs)).');
0235     MW=sum(P,2);
0236     
0237     %Then remove the calculations for elements with unknown mass
0238     I=find(useMat(:,isnan(EWs)));
0239     MW(I)=nan;
0240     MW(exitFlag~=1)=nan;
0241 end
0242 end

Generated by m2html © 2005