CCL Home Page
Up Directory CCL tosmiles.c
/*
   tosmiles.c

   converts connection tables to unique SMILES strings

   Simon Kilvington, University of Southampton, 1995
*/

#include "bbltyp.h"

/*
   contabtosmiles
   converts the given fragment into a unique SMILES string
   uses the algorithm described by the Weiningers in J.Chem.Inf.Comput.Sci. 29 (1989), 97-101
   returns a ptr to the string which needs to be free'd when you've finished with it
   returns NULL if any problems
   it will explicitly include any H atoms in frag in the resulting SMILES, so it's best to remove all H atoms from frag before calling this
*/

char *
contabtosmiles(smilescontab_t *frag)
{
   char *smiles;
   int *rank, *primesum, *tmprank, *primelist;
   int a, b, c, n, bto, nranks, lowest;
   int nbonds, nprimes;
   int prime, same;
   smilesatom_t *aptr;
   block_ptr blk;
   int i;

   if(!block_alloc(&blk, "iiii", &rank, frag->natoms, &primesum, frag->natoms, &tmprank, frag->natoms, &primelist, frag->natoms))
      return NULL;

/* generate an array of the first "frag->natoms" prime numbers */
   nprimes = 0;
   for(n=2; nprimesnatoms; n++)
   {
      prime = TRUE;
      for(c=2; c<=(n/2) && prime; c++)
	 prime = ((n % c) != 0);
      if(prime)
         primelist[nprimes++] = n;
   }

/* determine the initial graph invariants for each atom, store them in primesum */
   for(a=0; anatoms; a++)
   {
      aptr = &frag->atom[a];
   /* no of connections */
      nbonds = nbondsto(aptr);
      primesum[a] = nbonds * 100000;
   /* no of non-H bonds (frag should contain no H atoms) */
      primesum[a] += nbonds * 10000;
#if 0	/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
   /* atomic no */
      primesum[a] += md_atomicnumber(aptr->symbol) * 100;
   /* sign and absolute value of charge */
      primesum[a] += md_chargeonatom(aptr) * 10;
   /* no of attached H's */
      primesum[a] += (md_maxvalency(aptr->symbol) - nbonds);
#endif	/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
   }

   do
   {
   /* rank each atom based on primesum values (initially these are the invariants, on subsequent loops they are the tie breaker ranks) */
      sortranks(frag->natoms, primesum, rank);
      do
      {
      /* generate the product of the "corresponding primes" of each atoms neighbours */
	 for(a=0; anatoms; a++)
	 {
	    aptr = &frag->atom[a];
	    primesum[a] = 1;
	    for(b=0; bbondedto[b];
	       primesum[a] *= (bto != -1) ? primelist[rank[bto]-1] : 1;		/* -1 as array indices start at 0 */
	    }
	 }
   
      /* resolve ties in "rank" list using the "primesum" values, put the new ranks in "tmprank" */
	 nranks = resolveties(frag->natoms, rank, primesum, tmprank);
   
      /* see if the ranks are the same as last time */
	 same = TRUE;
	 for(a=0; anatoms && same; a++)
	    same = (rank[a] == tmprank[a]);

      /* move the sorted data back into rank array */
         memcpy(rank, tmprank, frag->natoms * sizeof(int));
      }
      while(!same);
   
   /* see if we have any tied atoms */
      if(nranks != frag->natoms)		/* break ties */
      {
      /* find lowest tied rank */
	 lowest = 0;
	 do
	 {
	    lowest++;
	    c = 0;
	    for(a=0; anatoms; a++)
	       c += (rank[a] == lowest);
	 }
	 while(c == 1);
      /* double all the ranks, store them in primesum array */
	 for(a=0; anatoms; a++)
	    primesum[a] = rank[a] * 2;
      /* reduce the first lowest tied one by 1 */
	 for(a=0; primesum[a] != (lowest * 2); a++)
	    ;
	 primesum[a]--;
      }

   }
   while(nranks != frag->natoms);

/* frag is now cannonicalised, lets get on with generating the smiles string */

   smiles = generatesmilesstring(frag, rank);

/* clean up */
   block_free(&blk);

   return smiles;
}

/*
   sortranks
   generates a list of ranks in "rank" based on the numbers in "value" (ie lowest value is ranked 1 etc)
   returns number of different ranks
   the value array will be corrupted, it'll end up as all -1's
*/

int
sortranks(int natoms, int *value, int *rank)
{
   int a, sort, lowest;

   sort = 1;
   do
   {
   /* find the lowest value that isnt -1, our tag for values that have already been sorted out (ho ho!) */
      lowest = -1;		/* not found yet */
      for(a=0; a 0)
	 {
	    for(a=0; a 0);
   }

   return nranks-1;
}

/*
   generatesmilesstring
   generates a smiles string for the fragment based on the given atomic ranks
*/

typedef struct				/* an array of these holds the data about each atom generated by the first pass thru the fragment */
{					/* the second pass uses these to write out the smiles string */
   char          symbol[2];		/* atom symbol */
   int          aromatic;		/* TRUE if ANY of the bonds to it are aromatic */
   int           ring[md_MAXBONDS];	/* ring closure numbers associated with this atom */
   int          rbracket;		/* does this atom end a branch */
   int          lbracket;		/* does the NEXT atom start a branch */
   int          endfrag;		/* do we need a "." after this atom to separate two structures */
   smilesbond_t bond;			/* order of bond to next atom */
} _smileyatomdata;

char *
generatesmilesstring(smilescontab_t *frag, int *rank)
{
   char *smiles;
   char tmpstr[8];
   int *bracketstack, *ringstack, *neworder;		/* neworder[i] is the the index of frag->atom[i] in the atomdata array */
   int bsptr, rsptr;
   int a, b, bto, f, lowest, tmp, smileslen;
   int smipos, atom, nextatom, ringatom, newring, closeatom, maxbonds, natoms, nrings;
   smilesbond_t order;
   int *visited;
   block_ptr blk;
   _smileyatomdata *atomdata, initatomdata;

   static const char bondchr[] = "-=#:";

   natoms = frag->natoms;

   if(!(atomdata = malloc(natoms * sizeof(_smileyatomdata))))
      return NULL;

   if(!block_alloc(&blk, "iiiB", &bracketstack, natoms, &ringstack, natoms*md_MAXBONDS, &neworder, natoms, &visited, natoms))
   {
      free(atomdata);
      return NULL;
   }

/* set up the initial (blank) atomdata entry */
   initatomdata.symbol[0] = '\0';
   initatomdata.symbol[1] = '\0';
   initatomdata.aromatic = FALSE;
   for(b=0; bnatoms; a++)	/* haven't visited any atoms yet */
      visited[a] = FALSE;
   natoms = 0;				/* number of atoms we have in our smiley atom data array so far */

/* find the atom with rank 1 */
   atom = lowestunvisitedatom(frag->natoms, rank, visited);

/* first pass, determine which order the atoms should appear in, and store some attributes of each one - start with the one we have just found */
   while(natoms < frag->natoms && atom != -1)
   {
   /* set up the atomdata entry for atom number "atom" */
      atomdata[natoms] = initatomdata;
      strncpy(atomdata[natoms].symbol, frag->atom[atom].symbol, 2);
      atomdata[natoms].aromatic = hasaromaticbonds(&frag->atom[atom]);
   /* remember we have visited it, and store its index into the atomdata array */
      visited[atom] = TRUE;
      neworder[atom] = natoms;
   /* if there are no unvisited atoms attached to this atom */
      if(nunvisitedbondsto(&frag->atom[atom], visited) == 0)
      {
      /* jump back to the atom on top of the branch stack if there is one */
	 if(bsptr > 0)
	 {
         /* add an end of branch bracket */
            atomdata[natoms].rbracket = TRUE;
         /* go back to the start of the branch */
	    bsptr --;
	    atom = bracketstack[bsptr];
	 }
         else
	 {
	 /* is it time to start a new structure, or have we reached the end of the molecule */
	    if(natoms != frag->natoms-1)
	    {
	       atomdata[natoms].endfrag = TRUE;
	       atom = lowestunvisitedatom(frag->natoms, rank, visited);
	    }
         /* increase the number of atoms we have seen, ready for next time */
            natoms ++;
	    continue;
	 }
      }
   /* there are unvisited atoms attached to this one */
/*!!!!!!!! what if the atom we pulled off the branch stack has no unvisited atoms attached to it?  !!!!!!!!!!*/
/*!!!!!!!! this can only happen if it's the last atom, or the end of a substructure                !!!!!!!!!!*/
nextatom=-1;
   /* find the lowest ranked, unvisited, atom attached to "atom", this is the next one to visit */
      lowest = frag->natoms+1;
      for(b=0; batom[atom].bondedto[b];
	 if(bto != md_NOBOND && !visited[bto] && rank[bto] < lowest)
	 {
	    lowest = rank[bto];
	    nextatom = bto;
	 }
      }
/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
if(nextatom==-1)
{
   printf("natoms=%d\tatom=%d\tnext=-1\n", natoms, atom+1);
   atom=-1;
   continue;
}
/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
   /* count how many bonds from this atom form rings */
      nrings = 0;
   /* see if "nextatom" forms a ring, ie can we follow "nextatom" and get back to "atom" again */
      ringatom = findunvisitedring(frag, atom, nextatom, visited);
   /* did we find a ring */
      if(ringatom != -1)
      {
      /* is the bond order between "atom" and "ringatom" double or triple */
	 order = getbondorder(&frag->atom[atom], ringatom);
	 if(order == SMILESBOND_Double || order == SMILESBOND_Triple)
	 {
	 /* follow "ringatom" instead of "nextatom" so we dont have to close the ring with a multiple bond */
	    tmp = nextatom;
	    nextatom = ringatom;
	    ringatom = tmp;
	 }
      /* store the two atoms on the ring closure stack */
	 ringstack[rsptr++] = atom;
	 ringstack[rsptr++] = ringatom;
      /* increase our ring count */
	 nrings ++;
      /* see if any of the other unvisited atoms connected to this one also form a ring */
	 for(b=0; batom[atom].bondedto[b];
	    if(bto == md_NOBOND || bto == nextatom || bto == ringatom || visited[bto])
	       continue;
	 /* does atom "bto" form a ring (ie can we trace a path from it back to "atom") */
	    newring = findunvisitedring(frag, atom, bto, visited);
	    if(newring != -1)
	    {
	    /* add the two atoms to the ring stack */
	       ringstack[rsptr++] = atom;
	       ringstack[rsptr++] = bto;
	    /* count how many bonds form rings */
	       nrings ++;
	    }
	 }
      }
   /* see if this is a branch point, ie more than 1 unvisited bonds to it that don't form rings */
      maxbonds = 1 + nrings;
      if(nunvisitedbondsto(&frag->atom[atom], visited) > maxbonds)
      {
	 atomdata[natoms].lbracket = TRUE;
      /* store this atom on the branch stack */
	 bracketstack[bsptr] = atom;
	 bsptr ++;
      }
   /* store the bond order between this atom and the next */
      atomdata[natoms].bond = getbondorder(&frag->atom[atom], nextatom);
   /* move onto the next atom */
      atom = nextatom;
   /* increase the number of atoms we have seen, ready for next time */
      natoms ++;
   }

/* did anything go wrong */
   if(natoms != frag->natoms)
   {
      free(atomdata);
      block_free(&blk);
      return NULL;
   }

/* pull the ring closure atoms off the ringstack and convert them into ring closure numbers */
   nrings = rsptr / 2;
   while(nrings > 0)
   {
      ringatom  = neworder[ringstack[--rsptr]];		/* convert the frag->atom index into an atomdata index */
      closeatom = neworder[ringstack[--rsptr]];
   /* mark the ring closure in atomdata[] */
      for(f=0; atomdata[ringatom].ring[f] != -1; f++)	/* find next free entry */
	    ;
      atomdata[ringatom].ring[f] = nrings;
      for(f=0; atomdata[closeatom].ring[f] != -1; f++)
	    ;
      atomdata[closeatom].ring[f] = nrings;
      nrings--;
   };

/* work out the max length "smiles" needs to be */
   smileslen = 1;					/* don't forget the terminator */
   for(a=0; anatoms; a++)
   {
   /* the atom name */
      smileslen += addsmilesatom(tmpstr, atomdata[a].symbol, atomdata[a].aromatic);
   /* ring closure numbers */
      for(b=md_MAXBONDS-1; b>=0; b--)
      {
         nrings = atomdata[a].ring[b];
	 if(nrings != -1)
	    smileslen += (nrings > 9) ? 3 : 1;		/* do we need a "%xx" or just a "x" */
      }
   /* right brackets */
      if(atomdata[a].rbracket)
	 smileslen ++;
   /* left brackets */
      if(atomdata[a].lbracket)
	 smileslen ++;
   /* bond symbols - we never need to write aromatic bond symbols because the bound atoms name's will be in lower case */
      order = atomdata[a].bond;
      if(order == SMILESBOND_Double || order == SMILESBOND_Triple)
	 smileslen ++;
   /* substructure separators */
      if(atomdata[a].endfrag)
         smileslen ++;
   }

/* alloc some space for "smiles" */
   if(!(smiles = malloc(smileslen)))
   {
      free(atomdata);
      block_free(&blk);
      return NULL;
   }

/* second pass - generate the smiles string using the data we have just set up */
   smipos = 0;				/* smipos is index of terminator in smiles string */
   for(a=0; anatoms; a++)
   {
   /* print the atom name */
      smipos += addsmilesatom(&smiles[smipos], atomdata[a].symbol, atomdata[a].aromatic);
   /* ring numbers are stored in reverse numerical order, so print them out backwards */
      for(b=md_MAXBONDS-1; b>=0; b--)
      {
         nrings = atomdata[a].ring[b];
	 if(nrings != -1)
	    smipos += addsmilesring(&smiles[smipos], nrings);
      }
   /* does this atom end a branch */
      if(atomdata[a].rbracket)
	 smiles[smipos++] = ')';
   /* does the atom end a substructure */
      if(atomdata[a].endfrag)
         smiles[smipos++] = '.';
   /* does the next atom start a branch */
      if(atomdata[a].lbracket)
	 smiles[smipos++] = '(';
   /* we never need to write aromatic bond symbols because the bound atoms name's will be in lower case */
      order = atomdata[a].bond;
      if(order == SMILESBOND_Double || order == SMILESBOND_Triple)
	 smiles[smipos++] = bondchr[order - SMILESBOND_Single];
   }
   smiles[smipos] = '\0';

   free(atomdata);
   block_free(&blk);

   return smiles;
}

/*
   addsmilesatom
   adds the smiles symbol(s) (in lower case if aromatic is TRUE) to the string
   returns the number of characters added
*/

#define NINSUBSET 10

int
addsmilesatom(char *string, char *symbol, int aromatic)
{
   int n, s;

   static const char *organicsubset[NINSUBSET] = { "B", "C", "N", "O", "P", "S", "F", "Cl", "Br", "I" };

/* if the symbol is not in the "organic subset" we need to enclose it in square brackets */
   for(s=0; s < NINSUBSET && strncmp(symbol, organicsubset[s], 2) != 0; s++)
      ;

/* count how many chars we add to the string */
   n = 0;

   if(s == NINSUBSET)
      string[n++] = '[';

   string[n++] = (aromatic) ? tolower(symbol[0]) : symbol[0];

/* is it a one letter symbol */
   if(symbol[1] != '\0')
      string[n++] = (aromatic) ? tolower(symbol[1]) : symbol[1];

   if(s == NINSUBSET)
      string[n++] = ']';

   return n;
}

#undef NINSUBSET

/*
   addsmilesring
   adds a ring closure digit (or %xx if ringnum > 9) to the given string
   returns the number of characters it added
*/

int
addsmilesring(char *string, int ringnum)
{
   int n;

   if(ringnum < 10)
   {
      *string = '0' + ringnum;
      n = 1;
   }
   else
   {
      n = (int)sprintf(string, "%%%d", ringnum);
   }

   return n;
}

/*
   lowestunvisitedatom
   returns the index of the atom which has "visited[]" set to FALSE and the lowest value in "rank[]"
*/

int
lowestunvisitedatom(int natoms, int *rank, int *visited)
{
   int a, lowestatom, lowestrank;

   lowestatom = -1;
   for(a=0; anatoms * sizeof(int))))
      return -1;

   ring = FALSE;
   for(b=0; batom[start].bondedto[b];
      if(close != md_NOBOND && close != branch && !visited[close])
      {
      /* take a copy of the visited array as we will need to change it */
         memcpy(done, visited, frag->natoms * sizeof(int));
      /* can we trace a path from "start", along the bond to "close", and end up at "branch" ? */
	 ring = dotheymeet(frag, start, close, branch, done);
      }
   }

   free(done);

   return ring ? close : -1;
}

/*
   dotheymeet
   recursively searches from "next" (but not along the bond to "start") for "target"
   only visits an atom i if visited[i]==FALSE
   all atoms that are visited will have visited[i] set to TRUE
   returns TRUE if it was found
*/

int
dotheymeet(smilescontab_t *frag, int start, int next, int target, int *visited)
{
   int b, bto;
   int meet;

/*!!!!! could we store "visited" for each atom connected to "next" here, then restore it before we return? !!!!!*/

   meet = FALSE;
   for(b=0; batom[next].bondedto[b];
      if(bto != md_NOBOND && bto != start && !visited[bto])
      {
	 visited[bto] = TRUE;
	 meet = (bto == target) ? TRUE : dotheymeet(frag, next, bto, target, visited);
      }
   }

   return meet;
}

/*
   nunvisitedbondsto
   returns the number of bonds to the given atom
   bonds are only counted if visited[i] == FALSE where i is the index of the atom bound to this one
*/

int
nunvisitedbondsto(smilesatom_t *aptr, int *visited)
{
   int b, n, bto;

   n=0;
   for(b=0; bbondedto[b];
      n += ((bto != md_NOBOND) && !visited[bto]);
   }

   return n;
}

/*
   nbondsto
   returns the total number of atoms bonded to this one
*/

int
nbondsto(smilesatom_t *aptr)
{
   int b, nbonds;

   nbonds = 0;
   for(b=0; bbondedto[b] != md_NOBOND);

   return nbonds;
}

/*
   getbondorder
   returns the bond order of the bond from the atom pointed to by "aptr" to atom number "atom"
   if they are not bonded it returns SMILESBOND_NoBond
*/

smilesbond_t
getbondorder(smilesatom_t *aptr, int atom)
{
   int b;
   smilesbond_t order;

   order = SMILESBOND_NoBond;
   for(b=0; bbondedto[b] == atom)
	 order = aptr->bondtype[b];
   }

   return order;
}

/*
   hasaromaticbonds
   returns TRUE if there are one or more aromatic bonds to the given atom
*/

int
hasaromaticbonds(smilesatom_t *aptr)
{
   int b;
   int has = FALSE;

   for(b=0; bbondedto[b] != md_NOBOND && aptr->bondtype[b] == SMILESBOND_Aromatic);

   return has;
}
Modified: Tue Jan 21 17:00:00 1997 GMT
Page accessed 354 times since Mon Mar 14 23:23:30 2005 GMT