CCL Home Page
Up Directory CCL rdpdb.c
/*****
This file is part of the Babel Program
Copyright (C) 1992-96 W. Patrick Walters and Matthew T. Stahl 
All Rights Reserved 
All Rights Reserved 
All Rights Reserved 

For more information please contact :

babel@mercury.aichem.arizona.edu
--------------------------------------------------------------------------------

FILE : rdpdb.c
AUTHOR(S) : Pat Walters
DATE : 10-92
PURPOSE : Routines to read a Brookhave Protien Data Bank file

******/
#include "bbltyp.h"

#define DELIMS "\t\n "
#define PDBATOM 0
#define PDBHETATM 1

#include    
#include                 
#include    
#include    

int 
read_pdb(FILE *file1, ums_type *mol)
{
  char pdb_line[BUFF_SIZE];
  int i = 0;
  int result;
  char temp_char;
  long int pos;
  int chain_num = 1;
  char res_num_str[5];
  char the_atm_id[5],the_res_name[5]; 
  char the_serial_num[6];

#ifdef DOTIME
  struct timeval s_val;
  double startTime, endTime;
#endif

  pos = ftell(file1);

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  startTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
#endif

  while ((fgets(pdb_line,sizeof(pdb_line), file1) != NULL) && NOTEQn(pdb_line,"END",3))
  { 
    if (EQn(pdb_line,"ATOM",4) || EQn(pdb_line,"HETATM",6))
    {
      i++;
    }
  }

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Getting Atoms: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  fseek(file1,pos,0);
  Atoms = i;

  initialize_ums(&mol);
  initialize_residues(&mol);
  ShowProgress(Atoms,"Reading Atoms");
	 
  i = MIN_ATOM;
  while ((fgets(pdb_line,sizeof(pdb_line), file1) != NULL) && NOTEQn(pdb_line,"END",3))
  {
    if (EQn(pdb_line,"TER",3))
      chain_num++;
    if (EQn(pdb_line,"ATOM",4) || EQn(pdb_line,"HETATM",6))
    {
      UpdateProgress();
      parse_atom_record(&pdb_line[6], i, mol);
      if (pdb_line[0] == 'A')
        stringToType(PDBATOM, i, mol);
      else 
        stringToType(PDBHETATM, i, mol);
      ChainNum(i) = chain_num;
      i ++;
    }
  }

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Reading Atoms: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  /*
   * Make sure Valence is zeroed out
   */

  Bonds = 0;
  for (i = MIN_ATOM; i <= Atoms; i++)
  {
    Valence(i) = 0;
  }

  fseek(file1,pos,0);
  process_connect_records(file1,mol); 

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Reading CONECT: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  result = assign_radii(mol);  
  if (!result) 
  {
    release_ums(mol);
    return(FALSE);
  }

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Assigning Radii: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  if ((mol->control) &&  (NOTEQ(InputKeywords,"CONECT_ONLY")))
  {
    result = assign_pdb_bonds(mol);
    if (!result)
      {
	release_ums(mol);
	return(FALSE);
      }
  }
  

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Assigning PDB bonds: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  qsort(mol->connections,Bonds,sizeof(connect_type),QSORT_PROTO sort_connections);
  dissect_connection_table(mol);

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Dissect Connection Table: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  if ((mol->control) &&  (NOTEQ(InputKeywords,"CONECT_ONLY")))
    check_bonds(mol);

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Check Bonds: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  result = assign_types(mol);

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Assigning Types: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  if (!result)
  {
    release_ums(mol);
    return(FALSE);
  }
  assign_bond_order(mol);

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Assigning Bond Order: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  result = build_connection_table(mol);

#ifdef DOTIME
  gettimeofday(&s_val, 0);
  endTime = (double) s_val.tv_sec + 0.000001*s_val.tv_usec;
  printf("Building Connection Table: %lf\n", endTime - startTime);
  startTime = endTime;
#endif

  if (!result)
  {
    release_ums(mol);
    return(FALSE);
  }
  qsort(mol->connections,Bonds,sizeof(connect_type),QSORT_PROTO sort_connections);
  dissect_connection_table(mol);
  BO(0,0) = -1;
  return(TRUE);  
}

void fix_A_type(char *type, char *id, char *res_type)
{
  /* Check for ASP, ASN and ASX */
  /* Check for GLU, GLN and GLX */
  if (EQn(id,"AD1",3) && (EQn(res_type,"AS",2) || EQn(res_type,"N",1)))
    strcpy(type,"O");
  else
    if (EQn(id,"AD2",3) && (EQn(res_type,"AS",2) || EQn(res_type,"N",1)))
      strcpy(type,"N");
  else
    if (EQn(id,"AE1",3) && (EQn(res_type,"GL",2) || EQn(res_type,"Q",1)))
      strcpy(type,"O");
  else
    if (EQn(id,"AE2",3) && (EQn(res_type,"GL",2) || EQn(res_type,"Q",1)))
      strcpy(type,"N");
  else
    if (EQn(id,"AD1",3) && (EQn(res_type,"HIS",3) || EQn(res_type,"H",1)))
      strcpy(type,"N");
  else
    if (EQn(id,"AE1",3) && (EQn(res_type,"HIS",3) || EQn(res_type,"H",1)))
      strcpy(type,"C");
  else
    if (EQn(id,"AE2",3) && (EQn(res_type,"HIS",3) || EQn(res_type,"H",1)))
      strcpy(type,"N");
  else
    if (EQn(id,"AD2",3) && (EQn(res_type,"HIS",3) || EQn(res_type,"H",1)))
      strcpy(type,"C");
}

void process_connect_records(FILE *file1, ums_type *mol)
{
  char pdb_line[BUFF_SIZE];  
  int the_serial_num, conn1, conn2, conn3, conn4;

  while ((fgets(pdb_line,sizeof(pdb_line), file1) != NULL) && NOTEQn(pdb_line,"END",3))
  {
    if EQn(pdb_line,"CONECT",6)
    {
      parse_conect_record(&pdb_line[6], &the_serial_num, &conn1, &conn2,
        &conn3, &conn4);
      if (conn1 > 0) add_bond(mol, the_serial_num, conn1);
      if (conn2 > 0) add_bond(mol, the_serial_num, conn2);
      if (conn3 > 0) add_bond(mol, the_serial_num, conn3);
      if (conn4 > 0) add_bond(mol, the_serial_num, conn4);
    }
  }
}

void add_bond(ums_type *mol, int i, int j)
{
  int k, start, end;
  /*
   * Need to find the atoms with the passed in serial numbers
   */

  if (i < j)
  {
    start = i;
    end = j;
  }
  else
  {
    start = j;
    end = i;
  }

  for (k = 1; k <= Atoms; k++)
  {
    if (SerialNum(k) == start)
    {
      start = k;
      break;
    }
  }
  for (k = start + 1; k <= Atoms; k++)
  {
    if (SerialNum(k) == end)
    {
      end = k;
      break;
    }
  }
  if (ChainNum(start) == ChainNum(end))
  {
    Connection(start,Valence(start)) = end;
    Connection(end,Valence(end)) = start;
    Valence(start)++;
    Valence(end)++;
    Start(Bonds) = start;
    End(Bonds) = end;
    Bonds++;
  }
}

int parse_atom_record(char *pdb_line, int the_atom, ums_type *mol)
{
/* ATOMFORMAT "(i5,1x,a4,a1,a3,1x,a1,i4,a1,3x,3f8.3,2f6.2,1x,i3)" */

  int len;
  int sc;
  char tmp_str[10];
  char *tmp_ptr;

  len = strlen(pdb_line) - 1;
  sc = 0;

  /* serial number */
  my_strncpy(tmp_str, &pdb_line[sc], 5); sc += 6;
  SerialNum(the_atom) = atoi(tmp_str);
  if (sc >= len) return(FALSE);

  /* atom name */
  my_strncpy(tmp_str, &pdb_line[sc], 4); sc += 4;
  strcpy(AtmId(the_atom), tmp_str);
  tmp_ptr = rtrim(tmp_str);
  strcpy(Type(the_atom), tmp_str);
  if (sc >= len) return(FALSE);

  /* alternate location  NOTUSED */
  my_strncpy(tmp_str, &pdb_line[sc], 1); sc += 1;
  if (sc >= len) return(FALSE);

  /* residue name */
  my_strncpy(tmp_str, &pdb_line[sc], 3); sc += 4;
  if ((tmp_str[0] == ' ') && (tmp_str[1] == ' ') && (tmp_str[2] == ' '))
  {
    strcpy(ResName(the_atom), "UNK");
  }
  else
  {
    strcpy(ResName(the_atom), tmp_str);
  }
  if (sc >= len) return(FALSE);

  /* chain ID  NOTUSED */
  my_strncpy(tmp_str, &pdb_line[sc], 1); sc += 1;
  if (sc >= len) return(FALSE);

  /* residue sequence number */
  my_strncpy(tmp_str, &pdb_line[sc], 4); sc += 4;
  ResNum(the_atom) = atoi(tmp_str);
  if (sc >= len) return(FALSE);

  /* residue insertion code NOTUSED */
  my_strncpy(tmp_str, &pdb_line[sc], 1); sc += 4;
  if (sc >= len) return(FALSE);

  /* X, Y, Z */
  my_strncpy(tmp_str, &pdb_line[sc], 8); sc += 8;
  X(the_atom) = atof(tmp_str);
  if (sc >= len) return(FALSE);

  my_strncpy(tmp_str, &pdb_line[sc], 8); sc += 8;
  Y(the_atom) = atof(tmp_str);
  if (sc >= len) return(FALSE);

  my_strncpy(tmp_str, &pdb_line[sc], 8); sc += 8;
  Z(the_atom) = atof(tmp_str);
  if (sc >= len) return(TRUE);

#ifdef COMPLETE_PDB
  /* occupancy NOTUSED */
  my_strncpy(tmp_str, &pdb_line[sc], 6); sc += 6;
  if (sc >= len) return(TRUE);

  /* temperature factor NOTUSED */
  my_strncpy(tmp_str, &pdb_line[sc], 6); sc += 7;
  if (sc >= len) return(TRUE);

  /* footnote number NOTUSED*/
  my_strncpy(tmp_str, &pdb_line[sc], 3);
#endif

  return(TRUE);
}

int
parse_conect_record(char *pdb_line, int *serial_number,
  int *conn1, int *conn2, int *conn3, int *conn4)
{
/* CONECTFORMAT "(5i5)" */
  int len;
  int sc;
  char tmp_str[10];

  len = strlen(pdb_line) - 1;
  sc = 0;
             
  *serial_number = *conn1 = *conn2 = *conn3 = *conn4 = 0;

  my_strncpy(tmp_str, &pdb_line[sc], 5); sc += 5;
  *serial_number = atoi(tmp_str);
  if (sc >= len) return TRUE;
      
  my_strncpy(tmp_str, &pdb_line[sc], 5); sc += 5;
  *conn1 = atoi(tmp_str);
  if (sc >= len) return TRUE;
         
  my_strncpy(tmp_str, &pdb_line[sc], 5); sc += 5;
  *conn2 = atoi(tmp_str);
  if (sc >= len) return TRUE;
    
  my_strncpy(tmp_str, &pdb_line[sc], 5); sc += 5;
  *conn3 = atoi(tmp_str);
  if (sc >= len) return TRUE;
      
  my_strncpy(tmp_str, &pdb_line[sc], 5); sc += 5;
  *conn4 = atoi(tmp_str);
      
  return TRUE;  
}

int
stringToType(int record_type, int the_atom, ums_type *mol)
{
   int ilen;
   int atnum;
   char *tmpstr1;
   char *tmpstr2;
   char *pos;
         
   if ((ilen = (int)strlen ( Type(the_atom) )) == 0) {
      show_warning ( "Error parsing the atom name (length of zero)." );
      return (FALSE);
   }
            
   tmpstr1 = strdup ( Type(the_atom) );
   tmpstr2 = strdup ( Type(the_atom) );
   uppercase ( tmpstr1 );
               
   if (record_type == PDBATOM) {
      /*   
       * In the standard PDB files, the first character will be blank or
       * a digit.  The second character will be the element we are looking
       * for.  There are exceptions to this.  Often D is used for deuterium
       * so we will convert D to H.  For some of the residues, the atom type
       * cannot be determined and the element name will start with A.
       * We will look for these and handle as appropriate.
       */
      ljust ( tmpstr2 );
      ljust ( tmpstr1 );
      pos = tmpstr1;
      while (isdigit(pos[0])) pos++;
      pos[1] = '\0';
      if (pos[0] == 'A') {
         fix_A_type(Type(the_atom), tmpstr2, ResName(the_atom));
         free ( tmpstr1 );
         free ( tmpstr2 );
         return (TRUE);
      }
   } else {  
      /*
       * The HETATM record is more complicated than the ATOM record in terms
       * of determining the type of element from the atom label.
       * We will do the following:
       * If there is no residue name then we assume the atom label is the
       * name of the element.   
       * If the residue name and the atom label are the same then we will
       * assume that the atom label is the name of the element.
       * Generally, if there is a letter in the first column then the
       * atom label represents an element with a 2 letter element name.
       * The exceptions so far to this is when the residue name is
       * ADR, COA, FAD, GPG, NAD, NAL or NDP.  In these cases there is a
       * letter in the first column and the second letter is the element
       * name.
       */  
      pos = tmpstr1;
      /*
       * Check the residue name and if one of those listed above, move
       * the pointer.
       */
      if ((!strcmp ( ResName(the_atom), "ADR" )) ||
          (!strcmp ( ResName(the_atom), "COA" )) ||
          (!strcmp ( ResName(the_atom), "FAD" )) ||
          (!strcmp ( ResName(the_atom), "GPG" )) ||
          (!strcmp ( ResName(the_atom), "NAD" )) ||
          (!strcmp ( ResName(the_atom), "NAL" )) ||
          (!strcmp ( ResName(the_atom), "NDP" ))) {
         pos++;
         pos[1] = '\0';
      } else if (isdigit(pos[0])) {
         pos++;
         pos[1] = '\0';   
      } else if (pos[0] != ' ') {
         pos[2] = '\0';
         if (isalpha(pos[1]) && isupper(pos[1])) pos[1] = tolower(pos[1]);
      } else {
         ljust ( pos );
         pos[1] = '\0';
      }
   }   
   strcpy(Type(the_atom), pos);
   free ( tmpstr1 );
   free ( tmpstr2 );
   return (TRUE);
}

#undef DELIMS
#undef PDBATOM
#undef PDBHETATM
Modified: Tue Jan 21 17:00:00 1997 GMT
Page accessed 7259 times since Sat Apr 17 21:36:47 1999 GMT