Tuesday 19 June 2012

Pig notes to self


Some commands


# Note: SUBSTRING(x, start, stop) works like a Python slice x[start:stop]
# (zero-based start index, stop index exclusive).
# So suppose field x holds "abcdefgh":
# SUBSTRING(x,3,4) => "d"
# SUBSTRING(x,2,5) => "cde"


Note: the code below is here for syntax purposes only - it does nothing meaningful.


comments 


/* .... over multiple lines ...*/


-- Parameters are referenced in the script as $name and are substituted
-- textually before the script is parsed. A value given with -param on the
-- command line overrides the matching %default below.
-- use -param arg1='abcd' on the command line
-- use -param myvar='xyz' on the command line
%default arg1 'default value'
%default myvar 'default value'
-- $myfile and $OUTPUTfile are used by the LOAD and STORE statements further
-- down; without a value, parameter substitution fails before the script runs.
-- use -param myfile='input.txt' -param OUTPUTfile='out_dir' on the command line
%default myfile 'input.txt'
%default OUTPUTfile 'output'


-- REGISTER makes the classes packaged in a jar available to this script.
-- NOTE(review): myudf.jar presumably provides the myudf.* functions shown in
-- the commented-out CONCAT example further down - confirm.
REGISTER myudf.jar;
REGISTER piggybank.jar;


-- DEFINE binds a short alias to a fully qualified UDF class, so the statements
-- below can call SUBSTRING(...) and LENGTH(...) without the package prefix.
DEFINE SUBSTRING org.apache.pig.piggybank.evaluation.string.SUBSTRING();
DEFINE LENGTH  org.apache.pig.piggybank.evaluation.string.LENGTH();


-- Read the '|'-delimited input named by the $myfile parameter with an
-- explicit three-column schema, then keep only one copy of each row.
my_file = LOAD '$myfile'
    USING PigStorage('|')
    AS (
        col1:chararray,
        col2:double,
        col3:long
    );
my_file = DISTINCT my_file;


-- Derive the working columns from my_file:
--   mycol : SUBSTRING(col1, 0, 14)
--   col4  : placeholder null column
--   col5  : col1 unchanged when LENGTH(col1) < 3; otherwise col1 with spaces
--           removed and the last two characters of that result dropped
--   col2, col3 : passed through
-- Note that col1 itself is NOT projected, so later statements must use mycol.
my_recs = FOREACH my_file GENERATE
    SUBSTRING(col1, 0, 14) AS mycol,
    null AS col4:chararray,
    (LENGTH(col1) < 3
        ? col1
        : SUBSTRING(REPLACE(col1, ' ', ''), 0, LENGTH(REPLACE(col1, ' ', '')) - 2)) AS col5:chararray,
    col2,
    col3;


-- Example of calling a registered UDF inside GENERATE (kept commented out):
-- CONCAT(myudf.ZeroPad6Left(col1), myudf.ZeroPad6Left(col1)) AS col6:chararray


-- A self-join needs two distinct aliases; the right-hand copy also renames
-- its fields so that nothing is ambiguous after the join.
my_recs_rhs = FOREACH my_recs GENERATE
    mycol AS rhs_mycol,
    col2  AS rhs_col2,
    col3  AS rhs_col3;

-- Multi-key inner join (keys are listed in parentheses).
my_joined = JOIN my_recs BY (mycol, col2), my_recs_rhs BY (rhs_mycol, rhs_col2);


-- After a JOIN, fields are addressed as alias::field.
my_joined = FILTER my_joined BY (my_recs::col3 < 1000);


-- Flatten the joined relation to plain field names before joining again, so
-- the next join does not produce nested alias::alias::field names.
my_joined_flat = FOREACH my_joined GENERATE
    my_recs::mycol AS lhs_mycol,
    my_recs::col2  AS lhs_col2,
    my_recs::col3  AS lhs_col3;

-- Left outer join: every row of my_joined_flat is kept; the my_recs fields are
-- null where there is no match.
my_joined2 = JOIN my_joined_flat BY lhs_mycol LEFT OUTER, my_recs BY mycol;


-- GENERATE must list at least one expression. The columns chosen here are
-- illustrative only.
my_fin_rec = FOREACH my_joined2 GENERATE
    my_joined_flat::lhs_mycol AS mycol,
    my_joined_flat::lhs_col2  AS col2,
    my_joined_flat::lhs_col3  AS col3,
    my_recs::col5             AS col5;


-- Write the result '|'-delimited to the location named by the $OUTPUTfile
-- parameter (supply it with -param OUTPUTfile=... or a %default).
STORE my_fin_rec INTO '$OUTPUTfile' USING PigStorage('|');

No comments: