Tuesday, 19 June 2012

Pig notes to self

Some commands

# Note: SUBSTRING works like a Python slice (start index inclusive, end index exclusive, both 0-based)
# so suppose field x has "abcdefgh"
# SUBSTRING(x,3,4) => "d"
# SUBSTRING(x,2,5) => "cde"

Note: this code is here for syntax purposes only - it does nothing meaningful ...


/*
 * Block comments like this one can run
 * over multiple lines.
 * This script is a syntax reference only: it loads a pipe-delimited file,
 * derives a few string columns, joins the result with itself, filters it,
 * left-outer-joins it again and stores a pipe-delimited result.
 */

-- Parameters are overridden on the command line, e.g.
--   pig -param arg1='abcd' -param myfile='in.txt' -param OUTPUTfile='out_dir' script.pig
-- %default supplies the value used when a parameter is not passed.
-- (arg1 is declared only to show the syntax; nothing below reads it.)
%default arg1 'default value'
%default myfile 'input.txt'
%default OUTPUTfile 'output'

REGISTER myudf.jar;
REGISTER piggybank.jar;

-- Short aliases for the piggybank string UDFs used below.
DEFINE SUBSTRING org.apache.pig.piggybank.evaluation.string.SUBSTRING();
DEFINE LENGTH  org.apache.pig.piggybank.evaluation.string.LENGTH();

my_file = LOAD '$myfile' USING PigStorage('|') AS (col1:chararray, col2:double, col3:long);
my_uniq = DISTINCT my_file; -- remove duplicates

-- col1 is kept in the projection because the joins below use it as a key.
-- col5: values shorter than 3 characters pass through unchanged; otherwise
-- spaces are removed and the last two characters are dropped.
my_recs = FOREACH my_uniq GENERATE
    col1,
    SUBSTRING(col1,0,14) AS mycol,
    null AS col4:chararray,
    (LENGTH(col1) < 3 ? col1 : SUBSTRING(REPLACE(col1,' ',''), 0,LENGTH(REPLACE(col1,' ',''))-2)) AS col5:chararray,
    col2,
    col3;

-- CONCAT(myudf.ZeroPad6Left(col1), myudf.ZeroPad6Left(col1)) AS col6:chararray

-- A relation cannot be joined with itself under a single alias, so a second
-- alias with the same contents is created for the other side of the join.
my_recs2 = FOREACH my_recs GENERATE *;

my_joined = JOIN my_recs BY (col1, col2), my_recs2 BY (col1, col2);

-- After a JOIN every field exists once per input, so fields must be
-- disambiguated with the alias:: prefix.
my_filtered = FILTER my_joined BY (my_recs::col3 < 1000);

-- Project back to plain field names so the next join does not produce
-- nested prefixes.
my_pairs = FOREACH my_filtered GENERATE
    my_recs::col1  AS col1,
    my_recs::mycol AS mycol,
    my_recs::col2  AS col2,
    my_recs::col3  AS col3;

my_joined2 = JOIN my_pairs BY col1 LEFT OUTER, my_recs2 BY col1;

-- GENERATE needs at least one expression; this column list is illustrative.
-- Fields coming from my_recs2 are null where the outer join found no match.
my_fin_rec = FOREACH my_joined2 GENERATE
    my_pairs::col1  AS col1,
    my_pairs::mycol AS mycol,
    my_pairs::col2  AS col2,
    my_pairs::col3  AS col3,
    my_recs2::col4  AS col4,
    my_recs2::col5  AS col5;

STORE my_fin_rec INTO '$OUTPUTfile' USING PigStorage('|');

No comments: