-
Notifications
You must be signed in to change notification settings - Fork 909
/
Copy pathpyspark-add-new-column.py
69 lines (50 loc) · 1.86 KB
/
pyspark-add-new-column.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every example below runs on.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows, one tuple per person: (firstname, lastname, gender, salary).
data = [
    ('James', 'Smith', 'M', 3000),
    ('Anna', 'Rose', 'F', 4100),
    ('Robert', 'Williams', 'M', 6200),
]
columns = ["firstname", "lastname", "gender", "salary"]

# Build the base DataFrame and print it before any column is added.
df = spark.createDataFrame(data, schema=columns)
df.show()
# Check whether a column named 'salary1' exists: df.columns is the list of
# column names, so a plain membership test is enough. With the schema built
# above ('firstname', 'lastname', 'gender', 'salary') the column is absent,
# so the marker line is printed.
if 'salary1' not in df.columns:
    print("aa")
# Column helpers used by the examples in the rest of the script.
from pyspark.sql.functions import concat_ws, current_date, lit, when

# Add a new constant column.
df.withColumn("bonus_percent", lit(0.3)).show()

# Add a column computed from an existing column.
df.withColumn("bonus_amount", df["salary"] * 0.3).show()

# Add a column by concatenating existing columns with a "," separator.
df.withColumn("name", concat_ws(",", "firstname", 'lastname')).show()

# Add the current date.
df.withColumn("current_date", current_date()).show()

# Add a grade derived from salary: below 4000 -> "A", 4000 to 5000
# (both ends included) -> "B", anything else -> "C".
df.withColumn(
    "grade",
    when(df["salary"] < 4000, lit("A"))
    .when(df["salary"].between(4000, 5000), lit("B"))
    .otherwise(lit("C")),
).show()
# Add columns using select(): each extra expression is aliased so the
# projected result carries a named new column next to firstname and salary.
df.select("firstname", "salary", lit(0.3).alias("bonus")).show()
# df.salary * 0.3 is already a Column expression, so it is aliased directly
# instead of being wrapped in lit(), which is only needed for plain values.
df.select("firstname", "salary", (df.salary * 0.3).alias("bonus_amount")).show()
df.select("firstname", "salary", current_date().alias("today_date")).show()
# Add columns using SQL: expose the DataFrame as a temporary view named "PER"
# so the same additions can be written as spark.sql() queries.
df.createOrReplaceTempView("PER")

# Constant column. The literal is written unquoted so that bonus is numeric,
# matching the lit(0.3) DataFrame examples; a quoted '0.3' would produce a
# string column.
spark.sql("select firstname,salary, 0.3 as bonus from PER").show()

# Column computed from an existing column.
spark.sql("select firstname,salary, salary * 0.3 as bonus_amount from PER").show()

# Current date.
spark.sql("select firstname,salary, current_date() as today_date from PER").show()

# Conditional column. The searched form "case when <condition>" is required:
# the simple form "case salary when salary < 4000" would compare salary for
# equality with the boolean result of the condition instead of testing the
# condition itself.
spark.sql(
    "select firstname,salary, "
    "case when salary < 4000 then 'A' "
    "else 'B' END as grade from PER"
).show()