/* CCL Home Page */
/* Up Directory CCL smilesto.c */
/*
   smilesto.c

   converts SMILES strings to connection tables

   Simon Kilvington, University of Southampton, 1995
*/

#include "bbltyp.h"

#define isatomchr(C)		(isalpha(C) || (C) == '*')

/*
   smilestocontab
   uses a finite state machine to convert a SMILES string to a connection table
   returns NULL if any trouble (invalid SMILES or no memory) or a ptr to the alloc'd fragment
   only the first molecule specified in the string will be constructed, terminators are fullstop, space, newline, and end of string
   differences between the SMILES used by this and "real" SMILES strings are:
   1. charges, isotope numbers, and attached hydrogens can not be specified inside an atoms brackets, eg [NH4+]
   2. as coords are not generated symmetry symbols (ie @, @@, /, and \) can not be used
!!!!!!!!!!!! change it so these are just ignored rather than causing parsing errors !!!!!!!!!!!!!!!!!
*/

#define MAXSMILESRINGS 100	/* max ring closure number +1 */

smilescontab_t *
smilestocontab(char *smiles)
{
   int i, b, rno, loss, lastatom, maxbranches, bstackptr, *bstack = NULL, ringatom[MAXSMILESRINGS];
   char atname[2], *bord = NULL;
   smilescontab_t *frag = NULL;
   smilesatom_t *aptr = NULL;
   smilesbond_t nextbond;
   int toggle;

   static const char bondchr[] = "~-=#:";

/* the states */
   enum state_typ { St_Start, St_Name1, St_OpenSquare, St_Name2, St_CloseSquare, St_Number, St_CloseBranch, St_OpenBranch, St_Bond, St_End, St_Error } state;

/* the tokens */
   enum {Tok_Symbol, Tok_Bond, Tok_Number, Tok_OpenSquare, Tok_CloseSquare, Tok_OpenBranch, Tok_CloseBranch, Tok_End };

/* the state machine */
   static const int next[9][8] = { {St_Name1, St_Error, St_Error,  St_OpenSquare, St_Error,       St_Error,      St_Error,       St_End},
				   {St_Name1, St_Bond,  St_Number, St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
				   {St_Name2, St_Error, St_Error,  St_Error,      St_Error,       St_Error,      St_Error,       St_Error},
				   {St_Error, St_Error, St_Error,  St_Error,      St_CloseSquare, St_Error,      St_Error,       St_Error},
				   {St_Name1, St_Bond,  St_Number, St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
				   {St_Name1, St_Bond,  St_Number, St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
				   {St_Name1, St_Bond,  St_Error,  St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
				   {St_Name1, St_Bond,  St_Error,  St_OpenSquare, St_Error,       St_Error,      St_Error,       St_Error},
				   {St_Name1, St_Error, St_Error,  St_OpenSquare, St_Error,       St_Error,      St_Error,       St_Error}  };

   for(i=0; inatoms = 0;

   maxbranches = strutils_noccurrences(smiles, '(');
   if (maxbranches > 0)
     if(!(bstack = malloc(maxbranches * sizeof(int))))
     {
       printf("No memory to convert SMILES string\n");
       free(frag);
       return NULL;
     }
   
   i = 0;			/* index into smiles string */
   bstackptr = 0;		/* no of branches on the stack */
   state = St_Start;
   while(state != St_Error && state != St_End)
   {
   /* see what the next token in the string is, and move onto the appropriate state */
      if(strchr(". \n", smiles[i]) != NULL)		state = (enum state_typ) next[state][Tok_End];
      else if(isatomchr(smiles[i]))			state = (enum state_typ) next[state][Tok_Symbol];
      else if(bord = strchr(bondchr, smiles[i]))	state = (enum state_typ) next[state][Tok_Bond];
      else if(isdigit(smiles[i]) || smiles[i] == '%')	state = (enum state_typ) next[state][Tok_Number];
      else if(smiles[i] == '[')				state = (enum state_typ) next[state][Tok_OpenSquare];
      else if(smiles[i] == ']')				state = (enum state_typ) next[state][Tok_CloseSquare];
      else if(smiles[i] == '(')				state = (enum state_typ) next[state][Tok_OpenBranch];
      else if(smiles[i] == ')')				state = (enum state_typ) next[state][Tok_CloseBranch];
      else 						state = St_Error;			/* invalid character */

   /* do we need to do anything in this state */
      switch(state)
      {
      case St_Name1:
      /* check it's not "Cl" or "Br" */
	 if(strncmp(&smiles[i], "Cl", 2) == 0 || strncmp(&smiles[i], "Br", 2) == 0)
	 {
	    atname[0] = smiles[i];
	    atname[1] = smiles[i+1];
	    i+=2;
	 }
	 else
	 {
	    atname[0] = smiles[i];
            atname[1] = '\0';
	    i++;
	 }
	 if(!addatomtocontab(&frag, atname, &lastatom, nextbond))
	    state = St_Error;
	 nextbond = SMILESBOND_Single;
	 break;

      case St_Name2:
	 if(smiles[i+1] == ']')
	 {
	    atname[0] = smiles[i];
	    atname[1] = '\0';
	    i++;
	 }
	 else
	 {
	    atname[0] = smiles[i];
	    atname[1] = smiles[i+1];
	    i += 2;
	 }
	 if(!addatomtocontab(&frag, atname, &lastatom, nextbond))
	    state = St_Error;
	 nextbond = SMILESBOND_Single;
	 break;

      case St_Number:
	 if(smiles[i] == '%')
	 {
	    rno = (10 * (smiles[i+1] - '0')) + (smiles[i+2] - '0');
	    i += 3;
	 }
	 else
	 {
	    rno = smiles[i] - '0';
	    i++;
	 }
	 if(ringatom[rno] == -1)		/* start of ring */
	 {
	    ringatom[rno] = lastatom;
	 }
	 else					/* end of ring */
	 {
	    if(!closecontabring(frag, ringatom[rno], lastatom))
	       state = St_Error;
	    ringatom[rno] = -1;
	 }
	 break;

      case St_Bond:
	 nextbond = (bord - bondchr);
	 i++;
	 break;

      case St_OpenBranch:
	 bstack[bstackptr++] = lastatom;
	 i++;
	 break;

      case St_CloseBranch:
	 if(bstackptr == 0)
	 {
	    printf("Unmatched closing branch bracket in SMILES string at character %d\n", i+1);
	    state = St_Error;
	 }
	 else
	 {
	    lastatom = bstack[--bstackptr];
	    i++;
	 }
	 break;

      case St_End:
	 if(frag->natoms == 0)
	 {
	    printf("SMILES string contains no atoms\n");
	    state = St_Error;
	 }
	 break;

      case St_Error:
         printf("Error parsing SMILES string at character %d \"%c\"\n", i+1, smiles[i]);
	 break;

      default:
	 i++;
	 break;
      }
   }

/* check we are not in the middle of any rings or branches */
   if(state != St_Error && bstackptr > 0)
   {
      printf("Unmatched brackets in SMILES string\n");
      state = St_Error;
   }

   for(i=0; state != St_Error && inatoms; i++)
      {
	 aptr = &frag->atom[i];
	 aptr->symbol[0] = toupper(aptr->symbol[0]);
	 aptr->symbol[1] = tolower(aptr->symbol[1]);
      }
   }

   if (bstack)
   {
     free(bstack);
     bstack = NULL;
   }

   return frag;
}

#undef MAXSMILESRINGS

/*
   addatomtocontab
   adds an extra atom to *frag and bonds it to *lastatom (unless the new atom is the first one) with the given bond type
   if bondtype is SINGLE and both atoms have lower case names an AROMATIC bond is used instead
   *frag is grown with realloc, so it may move; it is left untouched if there is no memory
   atname supplies the (up to) two characters of the atom name, it does not need to be terminated
   updates *lastatom to be the index of the atom just added (ie the last atom in the fragment),
   this is done even when the bond could not be made
   returns FALSE if any trouble (no memory, or too many bonds to *lastatom)
*/

int
addatomtocontab(smilescontab_t **frag, char *atname, int *lastatom, smilesbond_t bondtype)
{
   smilescontab_t *newptr;
   smilesatom_t *aptr, *bptr;
   int newatom, b, nbonds;
   int err = FALSE;

   newatom = (*frag)->natoms;

/* only look at the previous atom if there is one, for the first atom *lastatom does not refer to anything yet */
   if(newatom > 0 && bondtype == SMILESBOND_Single)
   {
      if(aromaticsmilessym(atname) && aromaticsmilessym((*frag)->atom[*lastatom].symbol))
         bondtype = SMILESBOND_Aromatic;
   }

   if(!(newptr = realloc((*frag), smiles_CONTABHDRSIZE + ((*frag)->natoms + 1) * sizeof(smilesatom_t))))
   {
      printf("No memory to build fragment\n");
      return FALSE;
   }
   (*frag) = newptr;
   (*frag)->natoms ++;

/* set up "newatom" with all its bonds free */
   aptr = &((*frag)->atom[newatom]);
   strncpy(aptr->symbol, atname, 2);

/* NOTE(review): the number of bond slots is taken from the size of bondedto[], which assumes it is an
   array member of smilesatom_t (it is indexed here straight after realloc without being allocated)
   and that bondtype[] has the same length - confirm against bbltyp.h */
   nbonds = (int) (sizeof aptr->bondedto / sizeof aptr->bondedto[0]);
   for(b=0; b<nbonds; b++)
   {
      aptr->bondedto[b] = md_NOBOND;
      aptr->bondtype[b] = SMILESBOND_NoBond;
   }

/* and bond it to "*lastatom" (unless newatom is the first) */
   if(newatom > 0)
   {
      bptr = &((*frag)->atom[*lastatom]);
      b = nextfreebondto(bptr);
      if(b != -1)
      {
         bptr->bondedto[b] = newatom;
         bptr->bondtype[b] = bondtype;
         aptr->bondedto[0] = *lastatom;
         aptr->bondtype[0] = bondtype;
      }
      else
      {
         printf("Too many bonds to atom %d\n", (*lastatom)+1);
         err = TRUE;
      }
   }

/* set up the return values */
   *lastatom = newatom;

   return !err;
}

/*
   closecontabring
   bonds atom1 and atom2 using either a SINGLE or AROMATIC bond depending on whether the names of both are lower case or not
   nothing in frag is changed unless both atoms have a free bond
   returns FALSE if any trouble (atom1 and atom2 are the same atom, or too many bonds to one of the atoms)
*/

int
closecontabring(smilescontab_t *frag, int atom1, int atom2)
{
   smilesatom_t *first, *second;
   smilesbond_t btype;
   int slot1, slot2;

/* eg "C11": both lookups below would return the same free slot and the atom would end up bonded to itself */
   if(atom1 == atom2)
   {
      printf("Ring closure bonds atom %d to itself\n", atom1+1);
      return FALSE;
   }

   first = &frag->atom[atom1];
   second = &frag->atom[atom2];

/* find a free bond on each atom before changing either of them */
   slot1 = nextfreebondto(first);
   if(slot1 == -1)
   {
      printf("Too many bonds to atom %d\n", atom1+1);
      return FALSE;
   }
   slot2 = nextfreebondto(second);
   if(slot2 == -1)
   {
      printf("Too many bonds to atom %d\n", atom2+1);
      return FALSE;
   }

   if(aromaticsmilessym(first->symbol) && aromaticsmilessym(second->symbol))
      btype = SMILESBOND_Aromatic;
   else
      btype = SMILESBOND_Single;

/* record the bond on both atoms */
   first->bondedto[slot1] = atom2;
   first->bondtype[slot1] = btype;

   second->bondedto[slot2] = atom1;
   second->bondtype[slot2] = btype;

   return TRUE;
}

/*
   aromaticsmilessym
   returns TRUE (non-zero) if the given SMILES atom symbol is lower case, and therefore aromatic
   only the first character is looked at, as metals may be called 'Zn' etc
   nb any first character that tolower() leaves unchanged counts as lower case, so '*' gives TRUE as well
*/

int
aromaticsmilessym(char *sym)
{
/* tolower() is only defined for EOF and unsigned char values, so don't pass it a plain (maybe negative) char */
   unsigned char first = (unsigned char) sym[0];

   return (first == tolower(first));
}

/*
   nextfreebondto
   returns index of first bondedto[] that is md_NOBOND, or -1 if there are no free bonds
*/

int
nextfreebondto(smilesatom_t *aptr)
{
   int f, nbonds;

/* NOTE(review): the number of bond slots is taken from the size of bondedto[], which assumes it is an
   array member of smilesatom_t rather than a pointer - confirm against bbltyp.h */
   nbonds = (int) (sizeof aptr->bondedto / sizeof aptr->bondedto[0]);

   for(f=0; f<nbonds; f++)
   {
      if(aptr->bondedto[f] == md_NOBOND)
         return f;
   }

   return -1;
}

/*
   strutils_noccurrences
   returns how many characters in the given '\0' terminated string are equal to c
   the terminator itself is never counted, so asking for '\0' gives 0
*/

int
strutils_noccurrences(char *buffer, char c)
{
   const char *p;
   int count = 0;

/* walk along the string until we reach the terminator */
   for(p = buffer; *p != '\0'; p++)
   {
      if(*p == c)
         count++;
   }

   return count;
}
/* Modified: Tue Jan 21 17:00:00 1997 GMT */
/* Page accessed 7143 times since Sat Apr 17 21:36:58 1999 GMT */