-
Notifications
You must be signed in to change notification settings - Fork 907
/
Copy pathpyspark-withcolumn.py
88 lines (70 loc) · 2.62 KB
/
pyspark-withcolumn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 10:20:19 2020
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
# Start (or reuse) the Spark session used by every example in this script.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows, in the order given by `columns` below.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Only column names are passed as the schema; no column types are declared here.
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)
# Column reference shared by the salary-based examples below.
salary_col = col("salary")

# Replace "salary" with the same column cast to Integer.
df2 = df.withColumn("salary", salary_col.cast("Integer"))
df2.printSchema()
df2.show(truncate=False)

# Replace "salary" with a value computed from the existing column.
df3 = df.withColumn("salary", salary_col * 100)
df3.printSchema()
df3.show(truncate=False)

# Add a new column derived from "salary".
df4 = df.withColumn("CopiedColumn", salary_col * -1)
df4.printSchema()

# Add a column holding the same literal value in every row.
df5 = df.withColumn("Country", lit("USA"))
df5.printSchema()

# Chain withColumn() calls to add two literal columns.
df6 = (
    df.withColumn("Country", lit("USA"))
    .withColumn("anotherColumn", lit("anotherValue"))
)
df6.printSchema()

# Display df with "gender" renamed to "sex"; the result is not stored.
(
    df.withColumnRenamed("gender", "sex")
    .show(truncate=False)
)

# Display df4 without the derived "CopiedColumn".
(
    df4.drop("CopiedColumn")
    .show(truncate=False)
)
# Rows whose first element is a nested (firstname, middlename, lastname) tuple;
# every remaining value is a string.
dataStruct = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

# Fields of the nested "name" struct: three nullable strings.
name_fields = [
    StructField(part, StringType(), True)
    for part in ('firstname', 'middlename', 'lastname')
]

# Explicit schema: the nested "name" struct followed by three nullable
# top-level string columns.
schemaStruct = StructType(
    [StructField('name', StructType(name_fields))]
    + [
        StructField(flat, StringType(), True)
        for flat in ('dob', 'gender', 'salary')
    ]
)
# Build a DataFrame from the nested rows using the explicit schema, then
# display its schema and contents.
df7 = spark.createDataFrame(data=dataStruct, schema=schemaStruct)
df7.printSchema()
df7.show(truncate=False)

# Disabled example, kept inside a bare string literal so it never executes.
# It splits comma-separated "name" and "address" values into six columns.
# The snippet is written as PySpark so it can be enabled by removing the
# surrounding triple quotes.
"""
columns = ["name", "address"]
data = [("Robert, Smith", "1 Main st, Newark, NJ, 92537"),
        ("Maria, Garcia", "3456 Walnut st, Newark, NJ, 94732")]
dfFromData = spark.createDataFrame(data=data, schema=columns)

def splitRow(f):
    nameSplit = f[0].split(",")
    addSplit = f[1].split(",")
    return (nameSplit[0], nameSplit[1],
            addSplit[0], addSplit[1], addSplit[2], addSplit[3])

newDF = dfFromData.rdd.map(splitRow)
finalDF = newDF.toDF(["First Name", "Last Name",
                      "Address Line1", "City", "State", "zipCode"])
finalDF.printSchema()
finalDF.show(truncate=False)
"""