Friday 3 June 2011

Teradata data migration from one Teradata system to another using arcmain

Problem

Need to migrate data from Teradata system SRC (i.e. SouRCe) to Teradata system TGT (i.e. TarGeT).

Approach

Create a pair of named pipes called $SRC_FIFO and $TGT_FIFO.
Link these together by cat'ing $SRC_FIFO and outputting into $TGT_FIFO and running in the background like this:

cat $SRC_FIFO > $TGT_FIFO &

The Producer for $SRC_FIFO is the arcmain process extracting data from SRC.
The Consumer for $SRC_FIFO is the cat command above i.e. cat $SRC_FIFO

The Producer for $TGT_FIFO is the input to $TGT_FIFO from the cat command above i.e. > $TGT_FIFO.
The Consumer for $TGT_FIFO is the arcmain process copying data into TGT.

Supporting Files

A single file is required with a list of tables and their associated databases in the SRC system with the following format:

databaseX.tableA
databaseY.tableB
...
databaseZ.tableC

Note in this version - there should not be no whitespace on any line in the file.
In the code, this file is referenced as $ARC_OBJECT_FILE.

Command line

To run the code, use something like:

./mig_test2dev.ksh ARC_OBJ_LIST=/tmp/example_archive_file_list 2>&1 | tee mig_test2dev.log

where the list of objects to be migrated is in the file /tmp/example_archive_file

Code

The code can be found at the end of this post.

Potential Problems

There is a reasonable desire to improve the throughput of both arcmain processes by increasing the SESSIONS=NN argument. I have had "general communication" errors when setting this argument to greater values.

Future improvements

This script was coded quickly and there are parts that need improvements - esp to make it generic for running any environment.
  • Allow whitespace in the file with the list of database tables (see example below)
  • Add break-point to a staging area to support migration across remote sites
    • Add compression support to arcmain procedures
    • Add chunking of arcmain archives (breaking into xGB chunks)
    • Adding copy across a network via SSH
  • Add check to determine whether the db.table already exists in the TGT system
    • If so, allow option of suffixing with date YYYYMMDD or NNN version number
  • GUI - there is already a product to do this - so probably not
Use and Support

Happy for the code to be used (at your own risk).
There is no support for this code - though would be happy to assist if able.

Code Listing


$ cat mig_test2dev.ksh
#!/bin/ksh
# Program: mig_test2dev.ksh
# Description: Teradata data migration script.
# Moves data between TD system SRC (ie SouRCe) and TD system TGT (ie TarGeT)
# Version: 1.0 gml 31-May-2011 Initial version
# ---------------------------------------------------------------------------------------

err_msg() {

echo "ERROR: in $PRG - $1"
echo "usage: mig_test2dev.ksh ARC_OBJ_LIST="
echo "example: ./mig_test2dev.ksh ARC_OBJ_LIST=/tmp/example_archive_file_list 2>&1 | tee mig_test2dev.log"
exit 1

}

# Remove remnant FIFOs
cleanup_fifo() {

if [ -n "$DEBUG" ]
then
set -x
fi

SRC_OR_TGT=$1
# clean up existing ARC_FIFO
if [ -p "${ARC_FIFO}_${SRC_OR_TGT}" ]
then
rm ${ARC_FIFO}_${SRC_OR_TGT}
fi

}

# create the arcmain scripts
create_arcmain_script() {

if [ -n "$DEBUG" ]
then
set -x
fi

SRC_OR_TGT=$1
THE_ARCMAIN_SCRIPT=${ARCMAIN_SCRIPT}_${NOW}_${SRC_OR_TGT}.arc
TMP_ARC_SCRIPT=${ARC_OBJ_LIST}.$$ # To create a temp script file
THE_ARC_FIFO=${ARC_FIFO}_${SRC_OR_TGT} # The FIFO is named in the arcmain script (instead of a filename)

if [ "$SRC_OR_TGT" = "SRC" ]
then
LOGON_STR="${TD_SRC}/${TD_SRC_LOGON}"
ARC_CMD="ARCHIVE"
cat ${ARC_OBJ_LIST} | awk -F'.' ' { printf("(%c%s%c.%c%s%c),\n",34,$1,34,34,$2,34) } ' > ${TMP_ARC_SCRIPT}
else # [ "$SRC_OR_TGT" = "TGT" ]
LOGON_STR="${TD_TGT}/${TD_TGT_LOGON}"
ARC_CMD="COPY"
cat ${ARC_OBJ_LIST} | awk -F'.' -v v_tgt_db=$TGT_DB -v v_tgt_tab_suffix=$TGT_TAB_SUFFIX ' { printf("(%c%s%c.%c%s%s%c) (FROM(%c%s%c.%c%s%c)),\n",34,v_tgt_db,34,34,$2,v_tgt_tab_suffix,34,34,$1,34,34,$2,34) } ' > ${TMP_ARC_SCRIPT}
fi


# build the arcmain_script from the arcmain_script_master
if [ -n "${TMP_ARC_SCRIPT}" ]
then
cat <${THE_ARCMAIN_SCRIPT}
logon ${LOGON_STR};
${ARC_CMD} DATA TABLES
EOF
cat ${TMP_ARC_SCRIPT} >> ${THE_ARCMAIN_SCRIPT}
cat <>${THE_ARCMAIN_SCRIPT}
RELEASE LOCK,
FILE=${THE_ARC_FIFO};
LOGOFF;
EOF

rm ${TMP_ARC_SCRIPT}

else
err_msg "No TMP_ARC_SCRIPT (${TMP_ARC_SCRIPT}) produced"
fi

}

# Main Program

# evaluate any arguments like X=A Y=B ... Z=C
eval $@

# if the DEBUG flag is set, then show the result/impact of every line
if [ -n "$DEBUG" ]
then
set -x
fi

# set up variables
PRG=`basename $0`
NOW=`date +"%Y%m%d%H%M%S"` # used (with $$) to allow multiple executions of this
TD_DEV=10.172.6.41 # Dev TD system
TD_TEST=10.172.6.71 # Test TD system
TD_SRC=${TD_SRC:-${TD_TEST}}
TD_TGT=${TD_TGT:-${TD_DEV}}
TD_SRC_LOGON=${TD_SRC_LOGON:-"ARCUSER1,xxxx"}
TD_TGT_LOGON=${TD_TGT_LOGON:-"ARCUSER1,yyyy"}
TD_SRC_LOGON_STR="${TD_SRC}/${TD_SRC_LOGON}"
TD_TGT_LOGON_STR="${TD_TGT}/${TD_TGT_LOGON}"
TGT_DB=${TGT_DB:-"ST3NF03_DB01"}
TGT_TAB_SUFFIX=${TGT_TAB_SUFFIX:-"001"}  # use suffix to avoid overwriting - needs to be more sophisticated

if [ -z "${ARC_OBJ_LIST}" ]
then
err_msg "The ARC_OBJ_LIST argument is missing"
fi

if [ ! -f "${ARC_OBJ_LIST}" ]
then
err_msg "ARC_OBJ_LIST (${ARC_OBJ_LIST}) file does not exist"
fi

HOME_DIR=/home/glourei1 # my home dir
SCRIPT_DIR=${HOME_DIR}/bin # script dir
BASE_ARC_OBJ_LIST=`basename ${ARC_OBJ_LIST}`
LOCK_FILE=$SCRIPT_DIR/${PRG}_${BASE_ARC_OBJ_LIST}.lck # lock file to ensure only one of these scripts runs at a time

# add - SRC_/TGT_ARC_FIFO_DIR
ARC_FIFO=FIFO
SRC_ARC_FIFO=${ARC_FIFO}_SRC
TGT_ARC_FIFO=${ARC_FIFO}_TGT
ARCMAIN_SCRIPT=$SCRIPT_DIR/arcmain_script
SRC_ARCMAIN_SCRIPT=${ARCMAIN_SCRIPT}_${NOW}_SRC.arc
TGT_ARCMAIN_SCRIPT=${ARCMAIN_SCRIPT}_${NOW}_TGT.arc
#let Teradata decide - was getting errors using high numbers
#SRC_NUM_SESSIONS="SESSIONS=8"
#TGT_NUM_SESSIONS="SESSIONS=4"

# This ensures only one of this processes runs at a time
if [ -f $LOCK_FILE ]
then
echo "ERROR: Already running $PRG"
echo " Protected by the lock file $LOCK_FILE"
exit 1
else
touch $LOCK_FILE
fi

# cleanup legacy fifos if required
cleanup_fifo SRC
cleanup_fifo TGT

# create the SRC and TGT arcmain scripts
create_arcmain_script SRC
create_arcmain_script TGT

# Set up the FIFOs
# arcmain ARCHIVE script writes as the Producer part of the SRC_ARC_FIFO
# which is Consumed by the cat below and written as the Producer to the
# TGT_ARC_FIFO which is Consumed by the arcmain COPY into the Target (TGT) database
mkfifo $SRC_ARC_FIFO
mkfifo $TGT_ARC_FIFO
cat ${SRC_ARC_FIFO} > ${TGT_ARC_FIFO} &

# run the arcmain archive
echo "Starting source arcmain ..."
arcmain $SRC_NUM_SESSIONS <$SRC_ARCMAIN_SCRIPT && SRC_RC=$? && [ $SRC_RC -ne 0 ] && err_msg "arcmain archive failed [SRC_RC=${SRC_RC}]" &
sleep 30

# run the arcmain copy/restore
echo "Starting target arcmain ..."
arcmain $TGT_NUM_SESSIONS <$TGT_ARCMAIN_SCRIPT && TGT_RC=$? && [ $TGT_RC -ne 0 ] && err_msg "arcmain copy/restore failed [TGT_RC=${TGT_RC}]" &

# need to wait till all the background processes have completed
wait

# clean up
rm $LOCK_FILE

# all's well - fin
exit 0

No comments: