Some commands
# Note SUBSTRING is like a python slice
# so suppose field x has "abcdefgh"
# SUBSTRING(x,3,4) => "d"
# SUBSTRING(x,2,5) => "cde"
Note this code is there for syntax purposes only - it does nothing meaningful ...
comments
/* .... over multiple lines ...*/
-- Parameter substitution: values can be passed on the command line, e.g.
--   -param arg1='abcd' -param myvar='xyz'
-- %default gives the value used when the parameter is not passed.
-- NOTE(review): neither arg1 nor myvar is referenced in the statements visible
-- in this script; the LOAD/STORE statements use $myfile and $OUTPUTfile instead.
%default arg1 'default value'
%default myvar 'default value'
-- Register the jars that hold the user-defined functions used by the script.
REGISTER myudf.jar;
REGISTER piggybank.jar;
-- Bind short names to the piggybank string UDFs so they can be called as
-- SUBSTRING(...) and LENGTH(...) in the statements that follow.
-- NOTE(review): these names presumably take precedence over any built-in
-- function of the same name - confirm against the Pig version in use.
DEFINE SUBSTRING org.apache.pig.piggybank.evaluation.string.SUBSTRING();
DEFINE LENGTH org.apache.pig.piggybank.evaluation.string.LENGTH();
-- Pipeline: load a pipe-delimited file, de-duplicate it, derive a few string
-- columns, join the result with a copy of itself and then left-outer join it
-- back to the derived records before storing the outcome.
-- The input path comes from the 'myfile' parameter and the output path from
-- the 'OUTPUTfile' parameter; no %default is visible for either, so both are
-- expected via -param on the command line.
my_file = LOAD '$myfile' USING PigStorage('|') AS (col1:chararray, col2:double, col3:long);
my_file = DISTINCT my_file; -- remove duplicates

-- Derived columns:
--   mycol : SUBSTRING(col1, 0, 14)
--   col4  : null placeholder declared as chararray
--   col5  : col1 unchanged when LENGTH(col1) < 3, otherwise col1 with spaces
--           removed and its last two characters dropped
-- col1 is carried through (appended last, so the positions of the existing
-- fields do not move) because the joins below use it as a key.
my_recs = FOREACH my_file GENERATE
    SUBSTRING(col1, 0, 14) AS mycol,
    null AS col4:chararray,
    (LENGTH(col1) < 3 ? col1 : SUBSTRING(REPLACE(col1, ' ', ''), 0, LENGTH(REPLACE(col1, ' ', '')) - 2)) AS col5:chararray,
    col2,
    col3,
    col1;
-- CONCAT(myudf.ZeroPad6Left(col1), myudf.ZeroPad6Left(col1)) AS col6:chararray

-- A self join needs the data under two different aliases so that the joined
-- fields can be told apart (my_recs::col3 vs my_recs2::col3).
my_recs2 = FOREACH my_recs GENERATE *;
my_joined = JOIN my_recs BY (col1, col2), my_recs2 BY (col1, col2);

-- After the join a bare col3 is ambiguous, so the field is qualified with the
-- alias it came from.
my_joined = FILTER my_joined BY (my_recs::col3 < 1000);

-- Keep every row of my_joined, attaching the matching my_recs row when one
-- exists.
my_joined2 = JOIN my_joined BY my_recs::col1 LEFT OUTER, my_recs BY col1;

-- NOTE(review): the original GENERATE clause had no projection list and no
-- terminating ';'. '*' keeps every joined column; replace it with the intended
-- field list if only some of the columns should be stored.
my_fin_rec = FOREACH my_joined2 GENERATE *;
STORE my_fin_rec INTO '$OUTPUTfile' USING PigStorage('|');
No comments:
Post a Comment