CCL Home Page
Up Directory CCL getpdb
#!/bin/ksh
##
## $RCSfile: getpdb,v $
## $Revision: 1.10 $
## $Date: 1995/07/20 15:43:36 $
##
## NAME
## getpdb -- get the current release of the Protein Data Bank
##
## SYNOPSIS
## getpdb [-cc|-help|-k|-n|-v|-version]
##   -cc    use current contents.lis file
##   -help  print help
##   -k     keep temp files
##   -n     dry run
##   -v     verbose ftp
##   -version  print version
##
## DESCRIPTION
##   getpdb is a ksh script which maintains a local mirror of the
## Protein Data Bank. It requires only standard Unix utilities (ftp,
## sed, nawk, cut, zcat).  It tries to get only files which have
## changed since the last invocation of getpdb: new or updated files
## are retrieved and obsolete files are removed.  Local additions are
## not removed.  Any files which were removed locally are replaced.
## Retrieved files are decompressed, trimmed of the filler text, and
## renamed (e.g., pdb1crn.ent.Z becomes 1crn.pdb).
##   The mechanism used to determine added, deleted, or changed files
## is simplistic and is based on a textual comparison of the
## contents.lis file from the current and previous invocations of
## getpdb.  This list is subsequently augmented by any files which are in
## the pdb distribution but are not in the pdb directory.
##
## CAVEATS & BUGS
## 1) There's currently no facility to get only parts of the PDB.  I'll do
##    this later.
## 2) Errors aren't handled very gracefully.  In particular, the ftp
##    process is assumed to succeed.  Any unreceived files /will/ be
##    shown as missing, but they'll not be reattempted.  The list of
##    local files ($PDBDIR/$CONTENTSfn) will be updated anyway and
##    become out of sync with the local files. Rerunning getpdb should
##    get the missing files.
## 3) If someone else owns $TMPDIR/contents.lis, ftp will fail.  You'll
##    get a message like "$TMPDIR/contents.lis: not owner".  getpdb will
##    use the existing version, which might be out of date.  I'll fix this
##    later
## 
## EXAMPLES
## $ getpdb -v -n -k
##   prints which files would be retrieved and removed and keeps the
##   contents.lis in $TMPDIR.
## $ getpdb -v -cc > getpdb-log 2>&1 &
##   performs the retrieval using the contents.lis which was preserved
##   from the first example, and logs to getpdb-log (sh syntax).
##   contents.lis is retained upon completion.
##
## REQUIREMENTS AND COMPATIBILITY
## - ksh-compatible shell (pdksh, bash should work)
## - standard unix utilities (comm, sed, nawk, zcat, cat, etc.)
## - >1.1Gb disk space
## - Tested platforms:
##   alpha-dec-osf13.2
##   mips-sgi-irix5.3
## 
## INSTALLATION
## 1) Put in a convenient place.  Your pdb directory is a good choice.
## 2) Make it executable (e.g., chmod 755 getpdb)
## 3) Edit this file below as specified, or set the environment variables
##    appropriately in your shell.
##
## AVAILABILITY
## New versions of this file may be obtained from
## http://dasher.wustl.edu/~reece/src/getpdb
## ftp://dasher.wustl.edu/pub/getpdb/
## 
## AUTHOR
## Reece Kimball Hart                 |email: reece@dasher.wustl.edu
## Biophysics & Biochemistry, Box 8231|WWW:   http://dasher.wustl.edu/~reece/
## Washington Univ. School of Medicine|Phone: (314) 362-4198 (lab)
## 660 South Euclid                   |                -7183 (fax)
## St. Louis, Missouri  63110    (USA)|PGP public key available by finger & WWW
##
## LICENSE
## This source code is hereby released to the public domain.  You are
## encouraged to copy and modify this file.  Please clearly document the
## source and reason for modifications.  Bug reports, code contributions,
## and suggestions are appreciated.
## 


###############################################################################
##                                                                           ##
##             YOU MAY NEED TO CHANGE THE FOLLOWING FOR YOUR SITE            ##
##                                                                           ##
###############################################################################

# PDBDIR is the local pdb coordinate directory.  Needs ~1.2Gb as of 950606.
# (a non-empty value inherited from the environment is kept)
: "${PDBDIR:=/data/pdb/coords}"

# TMPDIR is where temporary files are stored; /var/tmp is used when the
# environment doesn't already name a directory
: "${TMPDIR:=/var/tmp}"

# FTPPASS is your email address; hostname is only run when FTPPASS is not
# already set
: "${FTPPASS:=${USER}@$(hostname)}"

# PATH is a colon delimited list of directories.
# It is set explicitly so that a user's own homegrown programs are not
# picked up in place of the standard utilities.
PATH=/sbin:/bin:/usr/bsd


###############################################################################
##                                                                           ##
##         YOU SHOULD NOT NEED TO CHANGE ANYTHING BEYOND THIS POINT          ##
##                                                                           ##
###############################################################################

## tmpfn
# imported from http://dasher.wustl.edu/~reece/src/tmpfn
#
# Print the name of a file that does not currently exist in a temporary
# directory.
#   usage: tmpfn [-p prefix] [-d tempdir] [-c]
#     -p prefix   prefix of the generated name (directory part is stripped;
#                 default "tmpfn")
#     -d tempdir  directory to use (default $TMPDIR, else /tmp)
#     -c          also create the file (empty) with touch
# The name is written to stdout; usage and error messages go to stderr so
# that a caller capturing stdout never mistakes a message for a file name.
# NOTE: this function ends with "exit" and assigns the globals PREFIX,
# TMPDIR, TOUCHIT and FN, so call it from a command substitution.
# NOTE: without -c nothing reserves the name; a name is only known to have
# been unused at the moment it was checked.
tmpfn ()
    {
    while [ $# -gt 0 ]
    do
        case $1 in
            -p) shift; PREFIX=${1##*/}; shift;;
            -d) shift; TMPDIR=$1; shift;;
            -c) TOUCHIT=TRUE; shift;;
            *)
                echo "usage: ${0##*/} [-p prefix | -d tempdir | -c]" >&2; exit 1;;
        esac
    done

    [ -n "$TMPDIR" ] || TMPDIR=/tmp
    [ -n "$PREFIX" ] || PREFIX=tmpfn

    if [ -d "$TMPDIR" ] && [ -w "$TMPDIR" ]
    then
        # "[ ! -a file ]" is a three-argument test, which POSIX shells
        # evaluate as the binary AND of two non-empty strings (always
        # true); -e is the unambiguous existence test.
        FN=${TMPDIR}/${PREFIX}-$RANDOM
        while [ -e "$FN" ]
        do
            FN=${TMPDIR}/${PREFIX}-$RANDOM
        done
        [ -n "$TOUCHIT" ] && touch "$FN"
        echo "$FN"
        exit 0
    else
        echo "$0: FATAL: directory $TMPDIR doesn't exist or isn't writable" >&2
        exit 1
    fi
    }

## ftpscript
# This function takes files to get on stdin.  It's the skeleton of
# http://dasher.wustl.edu/~reece/src/ftpscript.
#
# It writes "open"/"user" commands followed by everything read from stdin
# into a temporary command file, feeds that file to "ftp -n", and removes
# the file afterwards.
# Globals:  VERBOSE (set to -v when -v is given; passed to ftp either way),
#           TMPfn and FTPSTATUS (scratch)
# Returns:  the exit status of ftp, or 1 if the command file could not be
#           created
ftpscript ()
    # args: [-v] FTPHOST FTPLOGIN FTPPASS
    {
    if [ "$1" = -v ]; then VERBOSE=-v; shift; fi

    TMPfn=`tmpfn` || return 1

    # The command file contains the login password, so it is created with
    # owner-only permissions; the subshell keeps the umask change local.
    (
    umask 077
    cat << EOF > "$TMPfn"
open $1
user $2 $3
EOF
    cat >> "$TMPfn"
    ) || { rm -f "$TMPfn"; return 1; }

    # $VERBOSE is deliberately unquoted: an empty value must add no argument
    ftp -n $VERBOSE < "$TMPfn"
    FTPSTATUS=$?

    # hand ftp's status back to the caller instead of rm's
    rm -f "$TMPfn"
    return $FTPSTATUS
    }


## initialize some variables

# who we are
RCSId="\$Id: getpdb,v 1.10 1995/07/20 15:43:36 reece Exp $"
APPNAME=${0##*/}

# the ftp server and the names used on it
FTPHOST=ftp.pdb.bnl.gov
FTPLOGIN=anonymous
RMTPDBDIR=all_entries
CONTENTSfn=contents.lis
STATUSfn=files.list

# working files: two have fixed names under $TMPDIR, the other three get
# generated names from tmpfn
LOCALCONTENTSfn=${TMPDIR}/${CONTENTSfn}
OURSTATUSffn=${TMPDIR}/${STATUSfn}
THEIRSTATUSffn=$(tmpfn -p theirstatus)
TOGETffn=$(tmpfn -p to-get)
TODELETEffn=$(tmpfn -p to-delete)

# option flags (changed by the command line) and run state
DRYRUN=FALSE
CURRENTCONTENTS=FALSE
KEEPTEMPS=FALSE
VERBOSE=
DIDSOMETHING=FALSE

## make sure we have a writable tmp directory
# (it has to be an existing directory that is both readable and writable;
# anything else is reported and ends the run)
[ -d ${TMPDIR} -a -r ${TMPDIR} -a -w ${TMPDIR} ] || {
    echo "${APPNAME}: temporary directory ${TMPDIR} doesn't exist or isn't readable and writeable"
    exit 1
}

## parse the command line
# Each pass looks at the flag in $1.  Flags that merely set a variable fall
# through to the single shift after the case; -help, -version and unknown
# flags end the script on the spot.
until [ $# -lt 1 ]
do
    case $1 in
        -n)  DRYRUN=TRUE ;;
        -cc) CURRENTCONTENTS=TRUE ;;
        -k)  KEEPTEMPS=TRUE ;;
        -v)  VERBOSE=-v ;;
        -help)
            # show this file from line 2 up to the first blank line
            sed -n -e "2,/^$/p" $0 | more
            exit 0
            ;;
        -version)
            echo "$RCSId"
            exit 0
            ;;
        *)
            echo "${APPNAME}: $1: flag not recognized.  Try -help."
            exit 1
            ;;
    esac
    shift
done

## make sure PDBDIR is a readable directory (and writable, unless this is a
## dry run), then make it the working directory
if [ ! -d "${PDBDIR}" ] || [ ! -r "${PDBDIR}" ]
then
    echo "${APPNAME}: FATAL: directory ${PDBDIR} doesn't exist or isn't readable."
    exit 1
fi

# a dry run (-n) changes nothing, so write access is only needed otherwise
if [ "${DRYRUN}" = FALSE ] && [ ! -w "${PDBDIR}" ]
then
    echo "${APPNAME}: FATAL: ${PDBDIR} isn't writable."
    exit 1
fi

# Everything after this point uses file names relative to the working
# directory (the existence checks, the ftp "get" targets, the removal of
# obsolete .pdb files and the saved status file), so the script must not
# carry on in some other directory if the cd fails.
cd "${PDBDIR}" || {
    echo "${APPNAME}: FATAL: couldn't change directory to ${PDBDIR}."
    exit 1
}

# get the contents.lis by ftp, or use the current one if so directed
if [ "${CURRENTCONTENTS}" = TRUE ]
then
    # try to use an existing contents.lis file
    if [ -f "${LOCALCONTENTSfn}" ]
    then
        echo "${APPNAME}: Using current ${LOCALCONTENTSfn}."
    else
        echo "${APPNAME}: FATAL: -cc specified and ${LOCALCONTENTSfn} not found."
        exit 1
    fi
else
    # get the current contents (a ls -l listing) from the pdb server
    # This assumes that contents.lis is essentially an ls -l on all_entries.
    # If it's not, a command like "ls -l all_entries contents.lis" would be
    # more appropriate (that's untested).
    echo "${APPNAME}: getting ${CONTENTSfn} from ${FTPHOST}..."
    # $VERBOSE is deliberately unquoted: when empty it must not become an
    # (empty) first argument of ftpscript
    echo "get ${RMTPDBDIR}/${CONTENTSfn} ${LOCALCONTENTSfn}" | ftpscript $VERBOSE "${FTPHOST}" "${FTPLOGIN}" "${FTPPASS}"

    # Stop here if the listing didn't arrive.  Going on without it would
    # make the server look empty, and the comparison further down would
    # then treat every previously retrieved file as obsolete.
    if [ ! -s "${LOCALCONTENTSfn}" ]
    then
        echo "${APPNAME}: FATAL: couldn't retrieve ${CONTENTSfn} from ${FTPHOST}."
        exit 1
    fi
fi


# hack the contents file to create a file in the following format:
# <id>:<size> <date>   (e.g., "108d: 530215 Jun  3 07:11")
# where <id> is 4 chars, <size> is 7 chars, <date> is 12 chars
# (NOTE(review): the placeholder names were missing from this comment and
# have been reconstructed from the example and the sed expression below)
# this is the file we store when all's complete

# The exit status of a pipeline is that of its last command (sort), so a
# failure of grep to open the listing can't be detected afterwards; check
# for a readable listing up front instead.
if [ ! -r "${LOCALCONTENTSfn}" ]
then
    echo "${APPNAME}: FATAL: coulnd't open ${LOCALCONTENTSfn}!"
    exit 1
fi

grep "\.ent\.Z$" "${LOCALCONTENTSfn}" \
| cut -c26- \
| sed -e "s/\(.*\) pdb\(.*\)\.ent\.Z$/\2:\1/g" \
| sort \
> "${THEIRSTATUSffn}"

# A listing without a single pdb*.ent.Z entry is not a usable listing.
# Continuing with an empty status file would classify every previously
# retrieved file as obsolete.
if [ ! -s "${THEIRSTATUSffn}" ]
then
    echo "${APPNAME}: FATAL: no .ent.Z entries found in ${LOCALCONTENTSfn}!"
    exit 1
fi


# $STATUSfn is the list of files we got in a previous session.
# With no such list in $PDBDIR an empty stand-in is created by touch;
# otherwise the existing list is made visible in $TMPDIR through a
# symbolic link.
if [ ! -f ${PDBDIR}/${STATUSfn} ]
then
    touch ${OURSTATUSffn}
else
    ln -fs ${PDBDIR}/${STATUSfn} ${OURSTATUSffn}
fi


# we've now got two files in $TMPDIR.  One's the (possibly empty) status file,
# which contains all of the files we've already downloaded.  The other is
# a listing of the pdb ftp server's contents.  We'll do a complex series of
# comms to determine what's out of date, what's missing, and what's obsolete.

# uncomment the following to test on first 10 entries in ${LOCALCONTENTSfn}
# head ${THEIRSTATUSffn} > ${THEIRSTATUSffn}top && mv ${THEIRSTATUSffn}top ${THEIRSTATUSffn}

# get files which are new additions or updates to existing files
# (lines found only in the server's list)
comm -23 "${THEIRSTATUSffn}" "${OURSTATUSffn}" | cut -f1 -d: > "${TOGETffn}"

# and any which didn't change between editions and don't exist locally
# (lines common to both lists whose .pdb file is absent).  The ids come
# from the server's listing, so they are only ever used as data here and
# never turned into shell commands.
comm -12 "${THEIRSTATUSffn}" "${OURSTATUSffn}" | cut -f1 -d: \
| while read -r id
  do
      [ -f "${id}.pdb" ] || echo "${id}"
  done >> "${TOGETffn}"

# to-get must be sorted in order to remove its entries from the to-delete list
sort "${TOGETffn}" > "${TOGETffn}.tmp" && mv -f "${TOGETffn}.tmp" "${TOGETffn}"

# get the files which were removed between editions
# (lines found only in our list, minus anything that is about to be
# retrieved); this doesn't remove local additions
comm -13 "${THEIRSTATUSffn}" "${OURSTATUSffn}" | cut -f1 -d: | comm -23 - "${TOGETffn}" > "${TODELETEffn}"


# do the deletions first to make space
# Three cases: nothing to delete, a dry run (only list the files), or a
# real run (remove them and note that something changed).
if [ ! -s ${TODELETEffn} ]
then
    echo "${APPNAME}: No deletions from ${PDBDIR}/ were required"
elif [ ${DRYRUN} = TRUE ]
then
    echo "${APPNAME}: You need to remove: "
    nawk -v FS=: '{print $1".pdb"}' ${TODELETEffn} | paste - - - - -
else
    # remove obsolete files; xargs -t echoes each rm command it runs
    echo "${APPNAME}: Removing obsolete files..."
    nawk -v FS=: '{print $1".pdb"}' ${TODELETEffn} | xargs -t rm -f
    DIDSOMETHING=TRUE
fi


# and now do the retrievals
if [ -s "${TOGETffn}" ]
then
    if [ "${DRYRUN}" = TRUE ]
    then
        echo "${APPNAME}: You need to get: "
        nawk -v FS=: '{print $1".pdb"}' "${TOGETffn}" | paste - - - - -
    else
        echo "${APPNAME}: Getting new, missing, and updated files..."
        # one "get" per id, in binary mode, into the working directory
        nawk -v FS=: -v dir="${RMTPDBDIR}" 'BEGIN {print "binary"} {print "get " dir "/compressed_files/pdb"$1".ent.Z pdb"$1".ent.Z"}' "${TOGETffn}" | ftpscript ${VERBOSE} "${FTPHOST}" "${FTPLOGIN}" "${FTPPASS}"

        # decompress and strip junk from end of files
        echo "${APPNAME}: Decompressing and stripping files..."
        for fn in pdb*.ent.Z
        do
            # when nothing matches, the pattern itself is what we get; skip it
            [ -f "$fn" ] || continue
            [ -n "$VERBOSE" ] && echo "${APPNAME}: processing $fn"
            root=${fn#pdb}; root=${root%.ent.Z}

            # Decompress into a scratch file first.  In a single pipeline
            # only the status of the last command is seen, so a truncated
            # or corrupt archive would go unnoticed, a partial .pdb would
            # be installed and the archive deleted.
            if zcat "$fn" > "${root}.pdb.tmp" \
               && cut -c 1-70 "${root}.pdb.tmp" | sed -e 's/[ ]*$//g' > "${root}.pdb"
            then
                rm -f "$fn"
            else
                # Leave no .pdb behind: the final check then reports the
                # file as missing and a later run retrieves it again.
                echo "${APPNAME}: WARNING: couldn't decompress $fn; ${root}.pdb was not installed"
                rm -f "${root}.pdb"
            fi
            rm -f "${root}.pdb.tmp"
        done
        DIDSOMETHING=TRUE
    fi
else
    echo "${APPNAME}: No additions to ${PDBDIR}/ were required"
fi


# After a run that removed or retrieved something: report anything that is
# still missing and save the server's list as our new status file.
if [ "${DIDSOMETHING}" = TRUE ]
then
    # double check: make sure everything's there that should be
    TMPfn=`tmpfn`
    # The ids come from the server's listing; they are read as plain data
    # and never turned into shell commands.
    cut -f1 -d: "${THEIRSTATUSffn}" \
    | while read -r id
      do
          [ -f "${id}.pdb" ] || echo "${id}.pdb"
      done > "${TMPfn}"
    if [ -s "${TMPfn}" ]
    then
        echo "${APPNAME}: The following should be in ${PDBDIR}/ but weren't found:"
        paste - - - - - < "${TMPfn}"
        echo "${APPNAME}: You should try rerunning getpdb to get these files."
    fi
    rm -f "${TMPfn}"

    # preserve the current state
    # (${STATUSfn} is relative, i.e. it is written to the working directory)
    cp "${THEIRSTATUSffn}" "${STATUSfn}" || echo "${APPNAME}: WARNING: couldn't preserve current state in ${STATUSfn}"
    rm -f "${THEIRSTATUSffn}"
fi


# clean up
# Unless -k was given, remove the working files.  The contents listing is
# removed as well, except when -cc asked for an existing one to be used.
case ${KEEPTEMPS} in
    FALSE)
        rm -f ${TODELETEffn} ${TOGETffn} ${OURSTATUSffn} ${THEIRSTATUSffn}
        case ${CURRENTCONTENTS} in
            FALSE) rm -f ${LOCALCONTENTSfn} ;;
        esac
        ;;
esac


## $Log: getpdb,v $
## Revision 1.10  1995/07/20  15:43:36  reece
## * PATH set explicitly
## * now uses xargs for removing files
## * fixed small bug in stripping files and uses zcat instead of gunzip -c
## * bug fix: mistakenly updated files.list when files needed to be removed,
##   none needed getting, and -n was given.
##
## Revision 1.9  1995/07/19  21:10:04  reece
## Major update
## * ftpscript and tmpfn now local; getpdb is now independent of other scripts
## * hard-wired names only when necessary; tmp files are generated as needed
## * numerous small aesthetic and execution changes
##
## Revision 1.8  1995/06/07  21:47:26  reece
## * added -help, -version, -v flags
## * improved documentation, added installation instructions
## * documented user-specifiable variables
##
## Revision 1.7  1995/06/07  18:38:17  reece
## * fixed bug in which a file might be both obtained and removed
## * removes files before retrieving
## * temp files now in /var/tmp
## * -k flag to keep temp files
## * internal representation of file and date changed; comparison and
##   status files now use only filename, size, and date.  Comparison is
##   no longer thrown off by insignificant changes in contents.lis (e.g.,
##   changes in permissions, owner, etc.)
## * added final check to ensure everything's intact
##
## Revision 1.6  1995/05/23  19:32:01  reece
## replaced "ftpscript" and "tmpfn" with variable references to these
## programs
##
## Revision 1.1  1995/04/26  03:03:21  reece
## now uses ftpscript
## -n flag for DRYRUN to see what needs to be done
##
## Revision 1.0  1995/04/25  20:16:46  reece
## Initial revision
##
Modified: Sat Jul 22 16:00:00 1995 GMT
Page accessed 486 times since Tue Mar 15 01:01:13 2005 GMT