|
#!/bin/ksh
##
## $RCSfile: getpdb,v $
## $Revision: 1.10 $
## $Date: 1995/07/20 15:43:36 $
##
## NAME
## getpdb -- get the current release of the Protein Data Bank
##
## SYNOPSIS
## getpdb [-cc|-help|-k|-n|-v|-version]
## -cc use current contents.lis file
## -help print help
## -k keep temp files
## -n dry run
## -v verbose ftp
## -version print version
##
## DESCRIPTION
## getpdb is a ksh script which maintains a local mirror of the
## Protein Data Bank. It requires only standard Unix utilities (ftp,
## sed, nawk, cut, zcat). It tries to get only files which have
## changed since the last invocation of getpdb: new or updated files
## are retrieved and obsolete files are removed. Local additions are
## not removed. Any files which were removed locally are replaced.
## Retrieved files are decompressed, trimmed of the filler text, and
## renamed (e.g., pdb1crn.ent.Z becomes 1crn.pdb).
## The mechanism used to determine added, deleted, or changed files
## is simplistic and is based on a textual comparison of the
## contents.lis file from the current and previous invocations of
## getpdb. This list is subsequently augmented by any files which are in
## the pdb distribution but are not in the pdb directory.
##
## CAVEATS & BUGS
## 1) There's currently no facility to get only parts of the PDB. I'll do
## this later.
## 2) Errors aren't handled very gracefully. In particular, the ftp
## process is assumed to succeed. Any unreceived files /will/ be
## shown as missing, but they'll not be reattempted. The list of
## local files ($PDBDIR/$CONTENTSfn) will be updated anyway and
## become out of sync with the local files. Rerunning getpdb should
## get the missing files.
## 3) If someone else owns $TMPDIR/contents.lis, ftp will fail. You'll
## get a message like "$TMPDIR/contents.lis: not owner". getpdb will
## use the existing version, which might be out of date. I'll fix this
## later
##
## EXAMPLES
## $ getpdb -v -n -k
## prints which files would be retrieved and removed and keeps the
## contents.lis in $TMPDIR.
## $ getpdb -v -cc > getpdb-log 2>&1 &
## performs the retrieval using the contents.lis which was preserved
## from the first example, and logs to getpdb-log (sh syntax).
## contents.lis is retained upon completion.
##
## REQUIREMENTS AND COMPATIBILITY
## - ksh-compatible shell (pdksh, bash should work)
## - standard unix utilities (comm, sed, nawk, zcat, cat, etc.)
## - >1.1Gb disk space
## - Tested platforms:
## alpha-dec-osf13.2
## mips-sgi-irix5.3
##
## INSTALLATION
## 1) Put in a convenient place. Your pdb directory is a good choice.
## 2) Make it executable (e.g., chmod 755 getpdb)
## 3) Edit this file below as specified, or set the environment variables
## appropriately in your shell.
##
## AVAILABILITY
## New versions of this file may be obtained from
## http://dasher.wustl.edu/~reece/src/getpdb
## ftp://dasher.wustl.edu/pub/getpdb/
##
## AUTHOR
## Reece Kimball Hart |email: reece@dasher.wustl.edu
## Biophysics & Biochemistry, Box 8231|WWW: http://dasher.wustl.edu/~reece/
## Washington Univ. School of Medicine|Phone: (314) 362-4198 (lab)
## 660 South Euclid | -7183 (fax)
## St. Louis, Missouri 63110 (USA)|PGP public key available by finger & WWW
##
## LICENSE
## This source code is hereby released to the public domain. You are
## encouraged to copy and modify this file. Please clearly document the
## source and reason for modifications. Bug reports, code contributions,
## and suggestions are appreciated.
##
###############################################################################
## ##
## YOU MAY NEED TO CHANGE THE FOLLOWING FOR YOUR SITE ##
## ##
###############################################################################
# Local pdb coordinate directory (needs ~1.2Gb as of 950606).  A value
# already present in the environment takes precedence.
: "${PDBDIR:=/data/pdb/coords}"
# Directory for temporary files; /var/tmp unless the environment says
# otherwise.
: "${TMPDIR:=/var/tmp}"
# Password given to the ftp server: your email address.
: "${FTPPASS:=${USER}@$(hostname)}"
# Colon delimited command search path.  It is set explicitly so that a
# user's own homegrown programs are never picked up.
PATH=/sbin:/bin:/usr/bsd
###############################################################################
## ##
## YOU SHOULD NOT NEED TO CHANGE ANYTHING BEYOND THIS POINT ##
## ##
###############################################################################
## tmpfn
# imported from http://dasher.wustl.edu/~reece/src/tmpfn
#
# Print the name of a file which does not exist yet in a temp directory.
# usage: tmpfn [-p prefix | -d tempdir | -c]
#   -p prefix   leading part of the generated basename (default: tmpfn);
#               any directory part of the prefix is dropped
#   -d tempdir  directory for the file (default: $TMPDIR, else /tmp)
#   -c          create the (empty) file before printing its name
# Output:  the generated name on stdout.  Diagnostics go to stderr so that
#          a caller using command substitution never receives an error
#          message in place of a file name.
# Returns: 0 on success, 1 on a usage error or an unusable directory.
# Globals: sets TMPDIR, PREFIX, TOUCHIT and FN; callers in this file run it
#          inside a command substitution, so those changes do not leak.
tmpfn ()
{
  while [ $# -gt 0 ]
  do
    case $1 in
      -p|-d)
        # both flags need a value; don't shift past the end of the arguments
        if [ $# -lt 2 ]
        then
          echo "usage: ${0##*/} [-p prefix | -d tempdir | -c]" >&2
          return 1
        fi
        if [ "$1" = -p ]; then PREFIX=${2##*/}; else TMPDIR=$2; fi
        shift 2;;
      -c) TOUCHIT=TRUE; shift;;
      *)
        echo "usage: ${0##*/} [-p prefix | -d tempdir | -c]" >&2
        return 1;;
    esac
  done
  [ -n "$TMPDIR" ] || TMPDIR=/tmp
  [ -n "$PREFIX" ] || PREFIX=tmpfn
  if [ ! -d "$TMPDIR" ] || [ ! -w "$TMPDIR" ]
  then
    echo "$0: FATAL: directory $TMPDIR doesn't exist or isn't writable" >&2
    return 1
  fi
  # Draw candidate names until one is unused.  -e is used rather than -a:
  # "[ ! -a file ]" is read by some shells as the binary AND of two
  # non-empty strings, which is always true and defeats the check.
  FN=${TMPDIR}/${PREFIX}-$RANDOM
  while [ -e "$FN" ]
  do
    FN=${TMPDIR}/${PREFIX}-$RANDOM
  done
  if [ -n "$TOUCHIT" ]
  then
    touch "$FN"
  fi
  echo "$FN"
  return 0
}
## ftpscript
# This function takes files to get on stdin. It's the skeleton of
# http://dasher.wustl.edu/~reece/src/ftpscript.
ftpscript ()
# args: [-v] FTPHOST FTPLOGIN FTPPASS
{
if [ $1 = -v ]; then VERBOSE=-v; shift; fi
TMPfn=`tmpfn`
cat << EOF > $TMPfn
open $1
user $2 $3
EOF
cat >> $TMPfn
ftp -n $VERBOSE < $TMPfn
rm -f $TMPfn
}
## initialize some variables
# who we are
RCSId="\$Id: getpdb,v 1.10 1995/07/20 15:43:36 reece Exp $"
APPNAME=${0##*/}
# the remote side: server, login, and directory holding the entries
FTPHOST=ftp.pdb.bnl.gov
FTPLOGIN=anonymous
RMTPDBDIR=all_entries
# basenames of the server's listing and of our record of retrieved files
CONTENTSfn=contents.lis
STATUSfn=files.list
# fixed-name work files in the temporary directory
LOCALCONTENTSfn=${TMPDIR}/${CONTENTSfn}
OURSTATUSffn=${TMPDIR}/${STATUSfn}
# uniquely named work files, one tmpfn call each
THEIRSTATUSffn=$(tmpfn -p theirstatus)
TOGETffn=$(tmpfn -p to-get)
TODELETEffn=$(tmpfn -p to-delete)
# option flags; the command line parser may change these
CURRENTCONTENTS=FALSE
DRYRUN=FALSE
KEEPTEMPS=FALSE
VERBOSE=
# becomes TRUE once files have been removed or retrieved
DIDSOMETHING=FALSE
## make sure we have a writable tmp directory
# The three conditions are tested separately and with ${TMPDIR} quoted: the
# former single "-a" expression turned into a malformed test for a path
# containing whitespace, and the resulting error was then reported as a
# missing directory.
if [ ! -d "${TMPDIR}" ] || [ ! -r "${TMPDIR}" ] || [ ! -w "${TMPDIR}" ]
then
  echo "${APPNAME}: temporary directory ${TMPDIR} doesn't exist or isn't readable and writeable"
  exit 1
fi
## parse the command line
# A recognized flag sets its variable and is shifted away at the bottom of
# the loop; -help, -version and anything unrecognized leave the script
# before that shift is reached.
until [ $# -eq 0 ]
do
  case $1 in
    -cc)
      CURRENTCONTENTS=TRUE;;
    -k)
      KEEPTEMPS=TRUE;;
    -n)
      DRYRUN=TRUE;;
    -v)
      VERBOSE=-v;;
    -help)
      # page through this file from line 2 up to the first empty line
      sed -n -e "2,/^$/p" $0 | more
      exit 0;;
    -version)
      echo "$RCSId"
      exit 0;;
    *)
      echo "${APPNAME}: $1: flag not recognized. Try -help."
      exit 1;;
  esac
  shift
done
## make sure PDBDIR is a readable directory, that it is writable unless
## this is a dry run, and make it the working directory
if [ ! -d "${PDBDIR}" ] || [ ! -r "${PDBDIR}" ]
then
  echo "${APPNAME}: FATAL: directory ${PDBDIR} doesn't exist or isn't readable."
  exit 1
fi
if [ "${DRYRUN}" = FALSE ] && [ ! -w "${PDBDIR}" ]
then
  echo "${APPNAME}: FATAL: ${PDBDIR} isn't writable."
  exit 1
fi
# Everything below removes, retrieves and writes files by relative name, so
# carrying on after a failed cd would act on whatever directory we happen
# to be in.
cd "${PDBDIR}" || {
  echo "${APPNAME}: FATAL: couldn't change directory to ${PDBDIR}."
  exit 1
}
# get the contents.lis by ftp, or use the current one if so directed
if [ "${CURRENTCONTENTS}" = TRUE ]
then
  # try to use an existing contents.lis file
  if [ -f "${LOCALCONTENTSfn}" ]
  then
    echo "${APPNAME}: Using current ${LOCALCONTENTSfn}."
  else
    echo "${APPNAME}: FATAL: -cc specified and ${LOCALCONTENTSfn} not found."
    exit 1
  fi
else
  # get the current contents (a ls -l listing) from the pdb server
  # This assumes that contents.lis is essentially an ls -l on all_entries.
  # If it's not, a command like "ls -l all_entries contents.lis" would
  # be more appropriate (that's untested).
  echo "${APPNAME}: getting ${CONTENTSfn} from ${FTPHOST}..."
  # $VERBOSE is deliberately unquoted: when empty it must vanish entirely
  echo "get all_entries/${CONTENTSfn} ${LOCALCONTENTSfn}" | ftpscript $VERBOSE "${FTPHOST}" "${FTPLOGIN}" "${FTPPASS}"
  # Don't carry on without a listing.  A listing left behind by an earlier
  # run still passes this test, so a failed transfer can go unnoticed when
  # an old copy is lying around.
  if [ ! -s "${LOCALCONTENTSfn}" ]
  then
    echo "${APPNAME}: FATAL: couldn't get ${CONTENTSfn} from ${FTPHOST}."
    exit 1
  fi
fi
# hack the contents file to create a file in the following format:
# <id>:<size> <date> (e.g., "108d: 530215 Jun 3 07:11")
# where <id> is 4 chars, <size> is 7 chars, <date> is 12 chars
# this is the file we store when all's complete
# The exit status of the pipeline below is that of sort alone, so an
# unreadable listing has to be caught up front.
if [ ! -r "${LOCALCONTENTSfn}" ]
then
  echo "${APPNAME}: FATAL: couldn't open ${LOCALCONTENTSfn}!"
  exit 1
fi
grep "\.ent\.Z$" "${LOCALCONTENTSfn}" \
  | cut -c26- \
  | sed -e "s/\(.*\) pdb\(.*\)\.ent\.Z$/\2:\1/g" \
  | sort \
  > "${THEIRSTATUSffn}"
# An empty result would make every file we hold look obsolete further
# down, and the whole local copy would be removed: stop here instead.
if [ $? -ne 0 ] || [ ! -s "${THEIRSTATUSffn}" ]
then
  echo "${APPNAME}: FATAL: no pdb entries found in ${LOCALCONTENTSfn}; nothing was changed."
  exit 1
fi
# ${STATUSfn} in ${PDBDIR} lists the files obtained in a previous session.
# With no such record we start from an empty file in ${TMPDIR};
# otherwise the record is made visible there through a symbolic link.
if [ ! -f ${PDBDIR}/${STATUSfn} ]
then
  touch ${OURSTATUSffn}
else
  ln -fs ${PDBDIR}/${STATUSfn} ${OURSTATUSffn}
fi
# we've now got two files in $TMPDIR. One's the (possibly empty) status file,
# which contains all of the files we've already downloaded. The other is
# a listing of the pdb ftp server's contents. We'll do a complex series of
# comms to determine what's out of date, what's missing, and what's obsolete.
# uncomment the following to test on first 10 entries in ${LOCALCONTENTSfn}
# head ${THEIRSTATUSffn} > ${THEIRSTATUSffn}top && mv ${THEIRSTATUSffn}top ${THEIRSTATUSffn}
# get files which are new additions or updates to existing files
comm -23 "${THEIRSTATUSffn}" "${OURSTATUSffn}" | cut -f1 -d: > "${TOGETffn}"
# and any which didn't change between editions and don't exist locally
# (the names come from the server's listing, so they are tested directly
# rather than pasted into generated shell code and run through sh)
comm -12 "${THEIRSTATUSffn}" "${OURSTATUSffn}" | cut -f1 -d: \
  | while IFS= read -r pdbid
    do
      [ -f "${pdbid}.pdb" ] || echo "${pdbid}"
    done >> "${TOGETffn}"
# to-get must be sorted in order to remove its entries from the to-delete list
sort "${TOGETffn}" > "${TOGETffn}.tmp" && mv -f "${TOGETffn}.tmp" "${TOGETffn}"
# get the files which were removed between editions
# this doesn't remove local additions
comm -13 "${THEIRSTATUSffn}" "${OURSTATUSffn}" | cut -f1 -d: | comm -23 - "${TOGETffn}" > "${TODELETEffn}"
# Deletions are handled before retrievals so that the space they free is
# available for the new files.
if [ ! -s ${TODELETEffn} ]
then
  echo "${APPNAME}: No deletions from ${PDBDIR}/ were required"
elif [ ${DRYRUN} = TRUE ]
then
  # dry run: only list the obsolete files, five to a line
  echo "${APPNAME}: You need to remove: "
  nawk -F: '{ printf "%s.pdb\n", $1 }' ${TODELETEffn} | paste - - - - -
else
  # the real thing: xargs -t shows each rm command as it is run
  echo "${APPNAME}: Removing obsolete files..."
  nawk -F: '{ printf "%s.pdb\n", $1 }' ${TODELETEffn} | xargs -t rm -f
  DIDSOMETHING=TRUE
fi
# and now do the retrievals
if [ -s "${TOGETffn}" ]
then
  if [ "${DRYRUN}" = TRUE ]
  then
    echo "${APPNAME}: You need to get: "
    nawk -v FS=: '{print $1".pdb"}' "${TOGETffn}" | paste - - - - -
  else
    echo "${APPNAME}: Getting new, missing, and updated files..."
    # $VERBOSE is deliberately unquoted: when empty it must vanish entirely
    nawk -v FS=: 'BEGIN {print "binary"} {print "get all_entries/compressed_files/pdb"$1".ent.Z pdb"$1".ent.Z"}' "${TOGETffn}" | ftpscript ${VERBOSE} "${FTPHOST}" "${FTPLOGIN}" "${FTPPASS}"
    # decompress and strip junk from end of files
    echo "${APPNAME}: Decompressing and stripping files..."
    for fn in pdb*.ent.Z
    do
      # when nothing matches, the pattern itself is what we are handed
      [ -f "$fn" ] || continue
      [ -n "$VERBOSE" ] && echo "${APPNAME}: processing $fn"
      root=${fn#pdb}; root=${root%\.ent\.Z}
      # Decompress to a scratch file first: inside a pipeline a failing
      # zcat goes unnoticed, which used to leave a truncated .pdb behind
      # while the compressed original was removed anyway.
      if zcat "$fn" > "${root}.pdb.tmp" \
        && cut -c 1-70 "${root}.pdb.tmp" | sed -e 's/[ ]*$//g' > "${root}.pdb"
      then
        rm -f "$fn"
      else
        # drop the bad download and any partial result, so that the entry
        # counts as missing and is fetched again by the next run
        echo "${APPNAME}: WARNING: couldn't decompress $fn; ${root}.pdb will be missing"
        rm -f "$fn" "${root}.pdb"
      fi
      rm -f "${root}.pdb.tmp"
    done
    DIDSOMETHING=TRUE
  fi
else
  echo "${APPNAME}: No additions to ${PDBDIR}/ were required"
fi
# After any removal or retrieval: report entries of the server's list which
# are absent locally, then record that list as our new state.
if [ "${DIDSOMETHING}" = TRUE ]
then
  # double check: make sure everything's there that should be
  # (the names come from the server's listing, so they are tested directly
  # rather than pasted into generated shell code and run through sh)
  TMPfn=`tmpfn`
  cut -f1 -d: "${THEIRSTATUSffn}" \
    | while IFS= read -r pdbid
      do
        [ -f "${pdbid}.pdb" ] || echo "${pdbid}.pdb"
      done > "${TMPfn}"
  if [ -s "${TMPfn}" ]
  then
    echo "${APPNAME}: The following should be in ${PDBDIR}/ but weren't found:"
    paste - - - - - < "${TMPfn}"
    echo "${APPNAME}: You should try rerunning getpdb to get these files."
  fi
  rm -f "${TMPfn}"
  # preserve the current state
  cp "${THEIRSTATUSffn}" "${STATUSfn}" || echo "${APPNAME}: WARNING: couldn't preserve current state in ${STATUSfn}"
  rm -f "${THEIRSTATUSffn}"
fi
# Remove the work files unless they are to be kept.  The listing itself is
# only removed when this run fetched it, i.e. when it was not supplied by
# the user.
case ${KEEPTEMPS} in
  FALSE)
    rm -f ${TODELETEffn} ${TOGETffn} ${OURSTATUSffn} ${THEIRSTATUSffn}
    case ${CURRENTCONTENTS} in
      FALSE)
        rm -f ${LOCALCONTENTSfn};;
    esac
    ;;
esac
## $Log: getpdb,v $
## Revision 1.10 1995/07/20 15:43:36 reece
## * PATH set explicitly
## * now uses xargs for removing files
## * fixed small bug in stripping files and uses zcat instead of gunzip -c
## * bug fix: mistakenly updated files.list when files needed to be removed,
## none needed getting, and -n was given.
##
## Revision 1.9 1995/07/19 21:10:04 reece
## Major update
## * ftpscript and tmpfn now local; getpdb is now independent of other scripts
## * hard-wired names only when necessary; tmp files are generated as needed
## * numerous small aesthetic and execution changes
##
## Revision 1.8 1995/06/07 21:47:26 reece
## * added -help, -version, -v flags
## * improved documentation, added installation instructions
## * documented user-specifiable variables
##
## Revision 1.7 1995/06/07 18:38:17 reece
## * fixed bug in which a file might be both obtained and removed
## * removes files before retrieving
## * temp files now in /var/tmp
## * -k flag to keep temp files
## * internal representation of file and date changed; comparison and
## status files now use only filename, size, and date. Comparison is
## no longer thrown off by insignificant changes in contents.lis (e.g.,
## changes in permissions, owner, etc.)
## * added final check to ensure everything's intact
##
## Revision 1.6 1995/05/23 19:32:01 reece
## replaced "ftpscript" and "tmpfn" with variable references to these
## programs
##
## Revision 1.1 1995/04/26 03:03:21 reece
## now uses ftpscript
## -n flag for DRYRUN to see what needs to be done
##
## Revision 1.0 1995/04/25 20:16:46 reece
## Initial revision
##
|