Script:
import findspark
findspark.init('/Users/donghua/spark-2.4.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[2]').appName('Handson PySpark Chapter 5').getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('debug')
sc.getConf().getAll()
import urllib.request
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
localfile = '/tmp/kddcup.data_10_percent.gz'
f = urllib.request.urlretrieve(url, localfile)
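# Aside (a sketch, not part of the recorded run): on re-runs the download
# can be skipped when the file is already in place:
import os
if not os.path.exists(localfile):
    urllib.request.urlretrieve(url, localfile)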
raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')  # Spark decompresses .gz files transparently (one partition per gzip file)
csv = raw_data.map(lambda x: x.split(','))
duration = csv.map(lambda x: [int(x[0])])  # field 0 of each record is the connection duration
from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(duration)
summary.mean()[0]
summary.count()
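# Aside (output not shown in the recorded run below): the colStats summary
# also exposes per-column variance, min/max, and non-zero counts:
summary.variance()[0]
summary.max()[0]
summary.numNonzeros()[0]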
metrics = csv.map(lambda x: [x[0], x[4], x[5]])  # numeric strings; mllib coerces them to float when building vectors
metrics.take(2)
Statistics.corr(metrics, method="spearman")
Statistics.corr(metrics, method="pearson")
from pyspark.mllib.linalg import Vectors
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 8)
print(Statistics.chiSqTest(visitors_freq))
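# Aside: chiSqTest defaults to a uniform expected distribution; a second
# vector of expected frequencies can be supplied instead (hypothetical
# baseline here; output not shown below):
expected = Vectors.dense(0.1, 0.2, 0.3, 0.2, 0.2)
print(Statistics.chiSqTest(visitors_freq, expected))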
print(Statistics.chiSqTest(duration.collect()))
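# Aside: passing a Matrix of contingency counts runs a Pearson independence
# test instead of a goodness-of-fit test; a minimal sketch with made-up
# 2x2 counts (output not shown below):
from pyspark.mllib.linalg import Matrices
print(Statistics.chiSqTest(Matrices.dense(2, 2, [13.0, 47.0, 40.0, 80.0])))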
spark.stop()
Output (Jupyter):
import findspark
findspark.init('/Users/donghua/spark-2.4.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[2]').appName('Handson PySpark Chapter 5').getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('debug')
sc.getConf().getAll()
[('spark.sql.warehouse.dir', '/user/hive/warehouse'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'local-1553755489097'),
 ('spark.driver.port', '51208'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'Handson PySpark Chapter 5'),
 ('spark.driver.host', '192.168.31.177'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.master', 'local[2]')]
import urllib.request
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
localfile = '/tmp/kddcup.data_10_percent.gz'
f = urllib.request.urlretrieve(url, localfile)
raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
csv = raw_data.map(lambda x: x.split(','))
duration = csv.map(lambda x: [int(x[0])])
from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(duration)
summary.mean()[0]
0.06611054995637812
summary.count()
494021
metrics = csv.map(lambda x: [x[0],x[4],x[5]])
metrics.take(2)
[['0', '181', '5450'], ['0', '239', '486']]
Statistics.corr(metrics, method="spearman")
array([[ 1.        ,  0.01419628,  0.29918926],
       [ 0.01419628,  1.        , -0.16793059],
       [ 0.29918926, -0.16793059,  1.        ]])
Statistics.corr(metrics, method="pearson")
array([[ 1.00000000e+00,  4.25823027e-03,  5.43953448e-03],
       [ 4.25823027e-03,  1.00000000e+00, -1.59677215e-06],
       [ 5.43953448e-03, -1.59677215e-06,  1.00000000e+00]])
from pyspark.mllib.linalg import Vectors
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
Chi squared test summary:
method: pearson
degrees of freedom = 4
statistic = 0.5852136752136753
pValue = 0.9646925263439344
No presumption against null hypothesis: observed follows the same distribution as expected..
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 8)
print(Statistics.chiSqTest(visitors_freq))
Chi squared test summary:
method: pearson
degrees of freedom = 4
statistic = 22.469462151394424
pValue = 1.6158934330234853E-4
Very strong presumption against null hypothesis: observed follows the same distribution as expected..
print(Statistics.chiSqTest(duration.collect()))
Chi squared test summary:
method: pearson
degrees of freedom = 494020
statistic = 2041502.1434188513
pValue = 0.0
Very strong presumption against null hypothesis: observed follows the same distribution as expected..
spark.stop()
Some of the code is adapted from hands-pyspark-big-data-analysis-video.