-
Notifications
You must be signed in to change notification settings - Fork 13
/
MBADataPrep.pig
31 lines (29 loc) · 1.48 KB
/
MBADataPrep.pig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
-- Market-basket data preparation.
-- Reads tab-separated retail sales lines, reduces each stock code to its first
-- four characters (the "stock category"), and writes one comma-separated line
-- of distinct categories per invoice.

-- Clean-up HDFS: remove any previous output directory (rmf does not fail if it is absent)
rmf /user/admin/retail/marketbaskets

-- Loading raw data
InputFile = LOAD '/user/admin/retail/retailsalesraw/OnlineRetail.txt' using PigStorage('\t')
as ( InvoiceNo: int,
StockCode: chararray,
Description: chararray,
Quantity: int,
InvoiceDate: chararray,
UnitPrice: float,
CustomerID: int,
Country: chararray);

-- Keep only rows that have BOTH an invoice number and a stock code.
-- A field that cannot be converted to its declared type loads as null, so a
-- non-numeric InvoiceNo ends up null here. Such rows must be dropped: GROUP
-- puts every null key into one single group, which would otherwise be treated
-- as if it were a real invoice further down.
BasketsRawN = FILTER InputFile BY (InvoiceNo is not null AND StockCode is not null);

-- Drop rows whose stock code is one of these literal values.
-- NOTE(review): presumably these are non-merchandise codes (postage, bank
-- charges, manual entries) -- confirm against the data set documentation.
BasketsRawE = FILTER BasketsRawN BY NOT (StockCode == ' '
    OR StockCode == 'DOT'
    OR StockCode == 'POST'
    OR StockCode == 'BANK'
    OR StockCode == 'M'
    );

-- Project to (InvoiceNo, StockCat), where StockCat is the first four
-- characters of the stock code.
BasketsRaw = FOREACH BasketsRawE GENERATE InvoiceNo,
    SUBSTRING(StockCode,0,4) as (StockCat:chararray);

-- Exclude category '8509'.
-- NOTE(review): the reason for excluding this category is not visible here -- confirm.
BasketsRaw1 = FILTER BasketsRaw BY NOT ( StockCat == '8509');

-- Collapse repeated (InvoiceNo, StockCat) pairs so each category appears at
-- most once per invoice.
BasketsRawU = DISTINCT BasketsRaw1;

-- Build one bag of (InvoiceNo, StockCat) tuples per invoice.
BasketsGroupR1 = GROUP BasketsRawU by InvoiceNo;
BasketsGroupR2 = FOREACH BasketsGroupR1 GENERATE group as IN, BasketsRawU as Bkt;

-- Keep baskets holding between 2 and 9 distinct categories (bounds are exclusive).
BasketsGroupR3 = FILTER BasketsGroupR2 BY SIZE(Bkt) > 1 AND SIZE(Bkt) < 10;

-- Turn each bag of categories into a tuple and flatten it, so every output row
-- is just the list of categories of one basket (the invoice number is not emitted).
MarketBaskets = FOREACH BasketsGroupR3 GENERATE FLATTEN(BagToTuple(Bkt.StockCat));

-- Storing Market Baskets
STORE MarketBaskets into '/user/admin/retail/marketbaskets' using PigStorage (',');