df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("./data/retail-data/by-day/2010-12-01.csv")


df.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows


df.groupBy("StockCode").count().show(2)
df.groupBy("CustomerID").count().show(2)

+---------+-----+
|StockCode|count|
+---------+-----+
|    22728|    4|
|    21889|    5|
+---------+-----+
only showing top 2 rows

+----------+-----+
|CustomerID|count|
+----------+-----+
|   15311.0|   36|
|   16539.0|   21|
+----------+-----+
only showing top 2 rows


df.createOrReplaceTempView("dfTable") # 데이터프레임에 SQL을 적용시킬 수 있는 객체를 별도 생성


df.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows


from pyspark.sql.functions import lit
df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]


from pyspark.sql.functions import col
df.where(col("InvoiceNo")!=536365)\
.select("InvoiceNo", "Description")\
.show(3, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
+---------+-----------------------------+
only showing top 3 rows


df.where("InvoiceNo = 536366").show(3, False)

+---------+---------+-------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description              |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-------------------------+--------+-------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK   |6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT|6       |2010-12-01 08:28:00|1.85     |17850.0   |United Kingdom|
+---------+---------+-------------------------+--------+-------------------+---------+----------+--------------+


df.where("InvoiceNo <> 536366").show(3, False)

+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN               |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER    |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
only showing top 3 rows


from pyspark.sql.functions import instr

# Three boolean column expressions that are combined into the "isExpensive" flag below.
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
# instr(...) >= 1 serves as a "Description contains POSTAGE" test (POSTAGE = postal charge).
descripFilter = instr(col("Description"), "POSTAGE") >= 1

# Flag rows whose StockCode is "DOT" and that are either priced above 600 or mention
# POSTAGE, keep only the flagged rows, and print the first two with every column.
(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .show(2)
)

# Same flag and filter, printing only the price and the flag itself.
(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .select("unitPrice", "isExpensive")
    .show(2)
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+


from pyspark.sql.functions import expr

df.withColumn("isExpensive",expr("NOT UnitPrice <=250"))\
.where("isExpensive")\
.select("Description", "UnitPrice").show(2)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+


from pyspark.sql.functions import expr, pow

fabricatedQuantity = pow(col("Quantity")*col("UnitPrice"),2)+5


# 수치형: 곱셈, 덧셈, 뺄셈이 가능하다.
df.select(expr("CustomerID"), fabricatedQuantity.alias("realQuantity")).show(2)

+----------+------------------+
|CustomerID|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows


df.selectExpr("CustomerID",
             "(POWER((Quantity*UnitPrice), 2.0)+5) as realQuantity"
             ).show(2)

+----------+------------------+
|CustomerID|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows


# 소수점 첫째자리에서 반올림하는 예제
from pyspark.sql.functions import lit, round, bround
df.select(round(col("UnitPrice"),1).alias("rounded"), col("UnitPrice")).show(2)

+-------+---------+
|rounded|UnitPrice|
+-------+---------+
|    2.6|     2.55|
|    3.4|     3.39|
+-------+---------+
only showing top 2 rows


from pyspark.sql.functions import lit, round, bround
df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows


from pyspark.sql.functions import  corr
df.stat.corr("Quantity", "UnitPrice")

-0.04112314436835551


df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+-------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|        InvoiceDate|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+-------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|               3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128|               null| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|               null|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|2010-12-01 08:26:00|               0.0|           12431.0|     Australia|
|    max|          C536548|              POST|ZINC WILLIE WINKI...|               600|2010-12-01 17:35:00|            607.49|           18229.0|United Kingdom|
+-------+-----------------+------------------+--------------------+------------------+-------------------+------------------+------------------+--------------+


from pyspark.sql.functions import count, mean, stddev_pop, min,max


# Inputs for the approximate-quantile call below.
colName = "UnitPrice"
quantileProbs = [0.5]  # a single probability: 0.5, i.e. the median
relError = 0.05  # relative error handed to approxQuantile
# Pass colName instead of repeating the "UnitPrice" literal, so the variable
# defined above is actually the one that selects the column.
df.stat.approxQuantile(colName, quantileProbs, relError)  # 2.51

[2.51]


df.stat.crosstab("StockCode", "Quantity").show(2)

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|             21327|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  2|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
only showing top 2 rows


df.stat.freqItems(["StockCode", "Quantity"]).show(2, False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|StockCode_freqItems                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |Quantity_freqItems                                                                                                                                                                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[90214E, 20728, 20755, 21703, 22113, 22524, 22041, 72803A, 72798C, 90181B, 21756, 22694, 90206C, 20970, 21624, 90209C, 84744, 82494L, 22952, 20682, 22583, 21705, 20679, 22220, 90177E, 90214A, 22448, 90214S, 22121, 22802, 84970L, 72818, 90192, 90200C, 22910, 21380, 90211A, 21137, 35271S, 84926A, 20765, 22384, 21524, 22165, 22366, 21221, 21704, 22519, 85035C, 21967, 22114, 22909, 22900, 22447, 21577, 21877, 20726, 85034A, DOT, 84658, 21472, 22804, 22222, 72802C, 21739, 22467, 90214H, 22785, 22446, 22197, 20665, 21733, 22731, 21709, 22086, 40001, 85123A]|[200, 128, 23, 32, 50, 600, 8, 17, 80, -1, -10, 11, 56, 47, 20, -7, 2, 5, 480, -4, 14, 432, 100, 64, 40, 13, 4, -5, 22, 16, -2, 7, 70, 384, 25, 34, 10, 1, 288, 216, 28, 252, 19, 120, 192, 60, 96, 72, 144, 36, 27, 9, 18, 48, 21, 12, 3, -6, -24, 30, 15, 33, 6, 24, -12, -3]|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows


from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
|Set 7 Babushka Ne...|
|Glass Star Froste...|
|Hand Warmer Union...|
|Hand Warmer Red P...|
|Assorted Colour B...|
|Poppy's Playhouse...|
|Poppy's Playhouse...|
|Feltcraft Princes...|
|Ivory Knitted Mug...|
|Box Of 6 Assorted...|
|Box Of Vintage Ji...|
|Box Of Vintage Al...|
|Home Building Blo...|
|Love Building Blo...|
|Recipe Box With M...|
+--------------------+
only showing top 20 rows


from pyspark.sql.functions import lower, upper
df.select(col("Description"), lower(col("Description")),\
upper(lower(col("Description")))).show(2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows


from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

# Apply the trimming and padding functions to a literal; each result column is
# aliased after the function that produced it.
df.select(
    ltrim(lit("    HELLO   ")).alias("ltrim"),
    rtrim(lit("    HELLO   ")).alias("rtrim"),
    # Previously aliased "ltrim", which duplicated the first column's name and
    # mislabelled the trim() result.
    trim(lit("    HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp"),
).show(2)

+--------+---------+-----+---+----------+
|   ltrim|    ltrim|ltrim| lp|        rp|
+--------+---------+-----+---+----------+
|HELLO   |    HELLO|HELLO|HEL|HELLO     |
|HELLO   |    HELLO|HELLO|HEL|HELLO     |
+--------+---------+-----+---+----------+
only showing top 2 rows


# regexp_replace 함수를 사용해 Description 컬럼의 색상 이름을 COLOR 로 치환해보겠습니다.
from pyspark.sql.functions import regexp_replace
regex_string="BLACK|WHITE|RED|GREEN|BLUE"
df.select(
regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),\
    col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows


from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEFT", "1337"), \
          col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEFT, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows


from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
regexp_extract(col("Description"), extract_str,1).alias("color_clean"),
col("Description")
).show(2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows


from pyspark.sql.functions import instr

containsBlack = instr(col("Description"), "BLACK")>=1
containsWhite = instr(col("Description"), "WHITE")>=1

df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.where("hasSimpleColor").select("hasSimpleColor")\
.show(45, False)

+--------------+
|hasSimpleColor|
+--------------+
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
|true          |
+--------------+
only showing top 45 rows


from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    """Build a boolean column named ``is_<color_string>`` for one colour.

    The colour name is upper-cased and looked up in ``column`` with
    ``locate``; the resulting position is cast to boolean and aliased
    ``"is_" + color_string``.

    Args:
        column: Column to search in.
        color_string: Colour name; its upper-cased form is the search term
            and its original form is used in the alias.

    Returns:
        The aliased boolean column expression.
    """
    position = locate(color_string.upper(), column)
    found = position.cast("boolean")
    return found.alias("is_" + color_string)

# One boolean "is_<color>" column per entry in simpleColors.
selectColumns = [color_locator(df.Description, c) for c in simpleColors]
selectColumns.append(expr("*"))  # "*" is wrapped in expr() so it is a Column like the rest

# Fixed: this statement referred to the undefined name `selectedColumns`, which
# raised the NameError recorded in the traceback after this cell. The list
# built above is called `selectColumns`.
(
    df.select(*selectColumns)
    .where(expr("is_white OR is_red"))
    .select("Description")
    .show(2, False)
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-ebcf13d83100> in <module>
      7 selectColumns.append(expr("*")) # column 타입이어야한다.
      8 
----> 9 df.select(*selectedColumns).where(expr("is_white OR is_red"))\
     10 .select("Description").show(2, False)

NameError: name 'selectedColumns' is not defined


from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())

dateDF.createOrReplaceTempView("dateTable")


dateDF.show(2)

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2021-07-04|2021-07-04 11:20:...|
|  1|2021-07-04|2021-07-04 11:20:...|
+---+----------+--------------------+
only showing top 2 rows


from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"),5), date_add(col("today"),5))\
.show(3)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2021-06-29|        2021-07-09|
|        2021-06-29|        2021-07-09|
|        2021-06-29|        2021-07-09|
+------------------+------------------+
only showing top 3 rows


from pyspark.sql.functions import datediff, months_between, to_date

dateDF.withColumn("week_ago", date_sub(col("today"),7))\
.select(datediff(col("week_ago"), col("today"))).show(1)

dateDF.select(
to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
.select(months_between(col("start"), col("end"))).show(2)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
|                    -16.67741935|
+--------------------------------+
only showing top 2 rows


from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2017-02-01"))\
.select(to_date(col("date"))).show(3)

+-------------+
|to_date(date)|
+-------------+
|   2017-02-01|
|   2017-02-01|
|   2017-02-01|
+-------------+
only showing top 3 rows


from pyspark.sql.functions import to_date

dateFormat = "yyyy-dd-MM"

cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))

cleanDateDF.createOrReplaceTempView("dateTable2")

cleanDateDF.show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+


from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

+------------------------------+
|to_timestamp(date, yyyy-dd-MM)|
+------------------------------+
|           2017-11-12 00:00:00|
+------------------------------+


cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+


# Compare against the bare date string. The earlier form "'2017-12-12'" carried
# apostrophes inside the Python string itself and matched no rows (original
# note: "doesn't work"); without them this is the same comparison as the
# lit("2017-12-12") filter above.
cleanDateDF.filter(col("date2") > "2017-12-12").show()

+----+-----+
|date|date2|
+----+-----+
+----+-----+


from pyspark.sql.functions import coalesce
df.select(coalesce(col("Description"), col("CustomerID"))).show(3,False)

+----------------------------------+
|coalesce(Description, CustomerID) |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|CREAM CUPID HEARTS COAT HANGER    |
+----------------------------------+
only showing top 3 rows


df.na.drop()
df.na.drop("any")
df.na.drop("all")
df.na.drop("all", subset=["StockCode", "InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]


df.na.fill("all", subset=["StockCode", "InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]


fill_cols_vals = {"StockCode":5, "Description":"No Value"}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]


# Replace empty-string values in Description with a placeholder.
# The placeholder was misspelled "UNKONW"; corrected to "UNKNOWN".
df.na.replace([""], ["UNKNOWN"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]


from pyspark.sql.functions import struct
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")
complexDF.show(2, False)

+--------------------------------------------+
|complex                                     |
+--------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365}|
|{WHITE METAL LANTERN, 536365}               |
+--------------------------------------------+
only showing top 2 rows


complexDF.select("complex.Description").show(2,False)
complexDF.select("complex.InvoiceNo").show(2,False)
complexDF.select(col("complex").getField("Description")).show(1,False)
complexDF.select("complex.*").show(3,False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 2 rows

+---------+
|InvoiceNo|
+---------+
|536365   |
|536365   |
+---------+
only showing top 2 rows

+----------------------------------+
|complex.Description               |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
+----------------------------------+
only showing top 1 row

+----------------------------------+---------+
|Description                       |InvoiceNo|
+----------------------------------+---------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |
|WHITE METAL LANTERN               |536365   |
|CREAM CUPID HEARTS COAT HANGER    |536365   |
+----------------------------------+---------+
only showing top 3 rows


from pyspark.sql.functions import split

df.select(col("Description")).show(2,False)
df.select(split(col("Description"), " ")).show(2,False)
df.select(split(col("Description")," ").alias("array_col"))\
.selectExpr("array_col[0]").show(2,False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 2 rows

+----------------------------------------+
|split(Description,  , -1)               |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                 |
+----------------------------------------+
only showing top 2 rows

+------------+
|array_col[0]|
+------------+
|WHITE       |
|WHITE       |
+------------+
only showing top 2 rows


from pyspark.sql.functions import size
df.select(size(split(col("Description")," "))).show(2,False)

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|5                              |
|3                              |
+-------------------------------+
only showing top 2 rows


from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description")," "), "WHITE")).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows


from pyspark.sql.functions import split, explode
df.withColumn("splitted", split(col("Description")," "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo","exploded").show(3,False)

+----------------------------------+---------+--------+
|Description                       |InvoiceNo|exploded|
+----------------------------------+---------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |WHITE   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HANGING |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HEART   |
+----------------------------------+---------+--------+
only showing top 3 rows


df.show(2, False)

+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN               |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows


from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(3,False)

+----------------------------------------------+
|complex_map                                   |
+----------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365}|
|{WHITE METAL LANTERN -> 536365}               |
|{CREAM CUPID HEARTS COAT HANGER -> 536365}    |
+----------------------------------------------+
only showing top 3 rows


df.select(create_map(col("Description"),col("InvoiceNo")).alias("complex_map")).show(1,False)

+----------------------------------------------+
|complex_map                                   |
+----------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365}|
+----------------------------------------------+
only showing top 1 row


df.select(create_map(col("Description"),col("InvoiceNo")).alias("complex_map"))\
.selectExpr("complex_map['WHITE METAL LANTERN']").show(2,False)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|null                            |
|536365                          |
+--------------------------------+
only showing top 2 rows


df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("explode(complex_map)").show(2,False)

+----------------------------------+------+
|key                               |value |
+----------------------------------+------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365|
|WHITE METAL LANTERN               |536365|
+----------------------------------+------+
only showing top 2 rows


jsonDF= spark.range(1).selectExpr("""
'{"myJSONKey": {"myJSONValue" : [1, 2, 3]}}' as jsonString
""")


from pyspark.sql.functions import get_json_object, json_tuple
jsonDF.select(
 get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
 json_tuple(col("jsonString"), "myJSONKey")).show(1,False)

+------+-----------------------+
|column|c0                     |
+------+-----------------------+
|2     |{"myJSONValue":[1,2,3]}|
+------+-----------------------+


from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")))

DataFrame[to_json(myStruct): string]


from pyspark.sql.functions import from_json
from pyspark.sql.types import *

parseSchema = StructType((
StructField("InvoiceNo", StringType(), True),
StructField("Description", StringType(),True)))


df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2,False)

+--------------------------------------------+-------------------------------------------------------------------------+
|from_json(newJSON)                          |newJSON                                                                  |
+--------------------------------------------+-------------------------------------------------------------------------+
|{536365, WHITE HANGING HEART T-LIGHT HOLDER}|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"}|
|{536365, WHITE METAL LANTERN}               |{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}               |
+--------------------------------------------+-------------------------------------------------------------------------+
only showing top 2 rows


udfExampleDF = spark.range(5).toDF("num")

udfExampleDF.select(col("num")).show(5)


def power3(double_value):
    """Return ``double_value`` raised to the third power.

    The ``**`` operator is applied to the argument as given, so the result
    keeps the argument's numeric type (an ``int`` in gives an ``int`` out).
    """
    exponent = 3
    cubed = double_value ** exponent
    return cubed
power3(2.0)

# udf 실행해보자. 먼저 DataFrame에 사용할 수 있게 함수를 등록한다.
from pyspark.sql.functions import udf

power3udf = udf(power3)
power3udf

from pyspark.sql.functions import col


udfExampleDF.select(col("num")).show(5)
udfExampleDF.select(power3udf(col("num"))).show(5)

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+

+-------------+
|POWER(num, 3)|
+-------------+
|          0.0|
|          1.0|
|          8.0|
|         27.0|
|         64.0|
+-------------+


from pyspark.sql.types import IntegerType, DoubleType

# 파이썬 UDF 등록
spark.udf.register("power3py", power3, IntegerType())

# 파이썬으로 등록된 UDF 사용
udfExampleDF.selectExpr("power3py(num)").show(5)

+-------------+
|power3py(num)|
+-------------+
|            0|
|            1|
|            8|
|           27|
|           64|
+-------------+


from pyspark.sql.types import IntegerType, DoubleType

# 파이썬 UDF 등록
spark.udf.register("power3null", power3, DoubleType())

# 파이썬으로 등록된 UDF 사용
udfExampleDF.selectExpr("power3null(num)").show(5)

+---------------+
|power3null(num)|
+---------------+
|           null|
|           null|
|           null|
|           null|
|           null|
+---------------+

쿠버네티스 인 액션 1장 정리 (0)	2022.01.08
스파크 완벽 가이드 7장 (0)	2021.07.07
nodejs (0)	2021.07.02
스파크 완벽 가이드 5장 (0)	2021.06.27
스파크 완벽가이드 4강 (0)	2021.06.27

killog

티스토리 뷰

스파크 완벽 가이드 6장

다양한 데이터 타입 다루기¶

API는 어디서 찾을까?¶

데이터 해석¶

스파크 데이터 타입으로 변환하기¶

불리언 데이터 타입 다루기¶

수치 데이터 타입 다루기¶

Working with Strings¶

Regular Expressions¶

Working with Dates and Timestamps¶

Working with Nulls in Data¶

Coalesce¶

ifnull, nullIf, nvl, and nvl2¶

drop¶

fill¶

replace¶

Ordering¶

Working with Complex Types¶

Structs¶

Arrays¶

split¶

Array Length : size¶

array_contains¶

explode¶

Maps¶

Working with JSON¶

User-Defined Functions¶

'독후감' 카테고리의 다른 글

티스토리툴바

« 2025/12 »
일	월	화	수	목	금	토
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31