Script:
import findspark
findspark.init('/Users/donghua/spark-2.4.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[2]').appName('Handson PySpark Chapter 5').getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('debug')
sc.getConf().getAll()
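getAll() dumps every setting as key/value pairs. If you only need a single value, SparkConf.get() takes a key and an optional default; a minimal sketch (the property names below are standard Spark keys):
sc.getConf().get('spark.master')
sc.getConf().get('spark.executor.memory', '1g')  # default returned if the key is unset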
import urllib.request
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
localfile = '/tmp/kddcup.data_10_percent.gz'
urllib.request.urlretrieve(url, localfile)
raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
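Spark reads gzip files transparently, but a .gz file is not splittable, so it lands in a single partition. A quick sanity check before parsing (illustrative only, not part of the recorded run):
raw_data.getNumPartitions()  # 1, since gzip is not splittable
raw_data.take(1)             # peek at the first raw line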
csv = raw_data.map(lambda x: x.split(','))
duration = csv.map(lambda x: [int(x[0])])  # field 0 of the parsed record is the connection duration
from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(duration)
summary.mean()[0]
summary.count()
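colStats() returns a MultivariateStatisticalSummary, which exposes more than mean() and count(); each accessor below returns a per-column array, so [0] picks out the duration column:
summary.variance()[0]
summary.max()[0]
summary.min()[0]
summary.numNonzeros()[0]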
metrics = csv.map(lambda x: [x[0], x[4], x[5]])  # duration, src_bytes, dst_bytes
metrics.take(2)
Statistics.corr(metrics, method="spearman")
Statistics.corr(metrics, method="pearson")
from pyspark.mllib.linalg import Vectors
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 8)
print(Statistics.chiSqTest(visitors_freq))
print(Statistics.chiSqTest(duration.collect()))  # collect() pulls every row to the driver; fine here at ~500k values
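By default chiSqTest() tests the observed vector against a uniform expected distribution; you can supply your own expected frequencies, and passing a matrix of observed counts runs a test of independence instead. A sketch with hypothetical numbers:
expected = Vectors.dense(0.2, 0.2, 0.2, 0.2, 0.2)           # explicit expected frequencies
print(Statistics.chiSqTest(visitors_freq, expected))
from pyspark.mllib.linalg import Matrices
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # 3x2 observed counts, column-major
print(Statistics.chiSqTest(mat))                            # independence test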
spark.stop()
Output (Jupyter):
import findspark
findspark.init('/Users/donghua/spark-2.4.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[2]').appName('Handson PySpark Chapter 5').getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('debug')
sc.getConf().getAll()
[('spark.sql.warehouse.dir', '/user/hive/warehouse'),
('spark.rdd.compress', 'True'),
('spark.app.id', 'local-1553755489097'),
('spark.driver.port', '51208'),
('spark.serializer.objectStreamReset', '100'),
('spark.executor.id', 'driver'),
('spark.submit.deployMode', 'client'),
('spark.app.name', 'Handson PySpark Chapter 5'),
('spark.driver.host', '192.168.31.177'),
('spark.ui.showConsoleProgress', 'true'),
('spark.master', 'local[2]')]
import urllib.request
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
localfile = '/tmp/kddcup.data_10_percent.gz'
urllib.request.urlretrieve(url, localfile)
raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
csv = raw_data.map(lambda x: x.split(','))
duration = csv.map(lambda x: [int(x[0])])
from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(duration)
summary.mean()[0]
0.06611054995637812
summary.count()
494021
metrics = csv.map(lambda x: [x[0], x[4], x[5]])
metrics.take(2)
[['0', '181', '5450'], ['0', '239', '486']]
Statistics.corr(metrics, method="spearman")
array([[ 1.        ,  0.01419628,  0.29918926],
       [ 0.01419628,  1.        , -0.16793059],
       [ 0.29918926, -0.16793059,  1.        ]])
Statistics.corr(metrics, method="pearson")
array([[ 1.00000000e+00,  4.25823027e-03,  5.43953448e-03],
       [ 4.25823027e-03,  1.00000000e+00, -1.59677215e-06],
       [ 5.43953448e-03, -1.59677215e-06,  1.00000000e+00]])
from pyspark.mllib.linalg import Vectors
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
Chi squared test summary:
method: pearson
degrees of freedom = 4
statistic = 0.5852136752136753
pValue = 0.9646925263439344
No presumption against null hypothesis: observed follows the same distribution as expected..
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 8)
print(Statistics.chiSqTest(visitors_freq))
Chi squared test summary:
method: pearson
degrees of freedom = 4
statistic = 22.469462151394424
pValue = 1.6158934330234853E-4
Very strong presumption against null hypothesis: observed follows the same distribution as expected..
print(Statistics.chiSqTest(duration.collect()))
Chi squared test summary:
method: pearson
degrees of freedom = 494020
statistic = 2041502.1434188513
pValue = 0.0
Very strong presumption against null hypothesis: observed follows the same distribution as expected..
spark.stop()