[DE Project: Real-Time Big Data Processing 'SIXAT'] 4. Spark MLlib

Baseline

  • Build a linear regression model using only the distance column.
# Create a Spark session
from pyspark.sql import SparkSession


# Create the instance (set max memory to prevent out-of-memory errors)
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("taxi-fare-prediction")\
                .config("spark.executor.memory", MAX_MEMORY)\
                .config("spark.driver.memory", MAX_MEMORY)\
                .getOrCreate()
  • Specify the max memory when creating the instance.
    • This heads off out-of-memory errors; a quick way to verify the setting is sketched below.
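A minimal sanity check (a sketch, assuming the `spark` session created above): the values can be read back from the runtime configuration to confirm they took effect.

# Sketch: confirm the memory settings on the session created above.
print(spark.conf.get("spark.executor.memory"))  # expected: 5g
print(spark.conf.get("spark.driver.memory"))    # expected: 5g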
# Load the files
trips_df = spark.read.csv("/Users/6mini/trip/*", inferSchema=True, header=True)


# Check the schema
trips_df.printSchema()
'''
root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
'''
  • If every column comes back typed as string, a file in the folder may be malformed and is worth checking; a quick check is sketched below.
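A minimal sketch of that check, assuming the trips_df loaded above: list the columns whose inferred type is still string. With inferSchema=True, only the two datetime columns and store_and_fwd_flag should remain strings.

# Sketch: columns still typed as plain strings after schema inference.
string_cols = [name for name, dtype in trips_df.dtypes if dtype == "string"]
print(string_cols)  # anything beyond the datetime/flag columns is suspicious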
# Register the DataFrame as a TempView so SQL can be used.
trips_df.createOrReplaceTempView("trips")


# Apply the outlier removal from the analysis phase.
query = """
SELECT 
    trip_distance,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 5
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""


# Build a DataFrame from the query and register it as a TempView as well.
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")


data_df.show()
'''
+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          2.1|        11.8|
|          0.2|         4.3|
|         14.7|       51.95|
|         10.6|       36.35|
|         4.94|       24.36|
|          1.6|       14.15|
|          4.1|        17.3|
|          5.7|        21.8|
|          9.1|        28.8|
|          2.7|       18.95|
|         6.11|        24.3|
|         1.21|       10.79|
|          7.4|       33.92|
|         1.01|        10.3|
|         0.73|       12.09|
|         1.17|       12.36|
|         0.78|        9.96|
|         1.66|        12.3|
|         0.93|         9.3|
|         1.16|       11.84|
+-------------+------------+
'''


# Check the statistics
data_df.describe().show()
'''
+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          13326131|          13326131|
|   mean| 2.887422840883046| 17.99071382100484|
| stddev|3.8336178303381163|13.011693630515671|
|    min|              0.01|              0.01|
|    max|             475.5|            4973.3|
+-------+------------------+------------------+
'''
  • Even after removing outliers, a fare of roughly 6 million KRW (max total_amount ≈ 4,973) still remains.
    • Moving on for now; a quick way to inspect such rows is sketched below.
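A sketch for eyeballing the remaining extreme fares before deciding whether to tighten the filter (the 1,000 threshold is an arbitrary illustration):

# Sketch: look at the largest remaining fares.
data_df.filter("total_amount > 1000").orderBy("total_amount", ascending=False).show()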
# Train/test split
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=6)


print(train_df.count())
print(test_df.count())
'''
10659118
2667013
'''
  • This is my first time handling big data, and compared to the datasets I have seen so far, it really is a lot.
# Use a VectorAssembler to turn the data into a trainable form.
from pyspark.ml.feature import VectorAssembler


vassembler = VectorAssembler(inputCols=["trip_distance"], outputCol="features")


vtrain_df = vassembler.transform(train_df)


vtrain_df.show()
'''
+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
'''


# Create the regression model
from pyspark.ml.regression import LinearRegression


# Simple baseline code
lr = LinearRegression(
    maxIter=50,
    labelCol="total_amount",
    featuresCol="features"
)


model = lr.fit(vtrain_df)


vtest_df = vassembler.transform(test_df)


# Predict
prediction = model.transform(vtest_df)


prediction.show()
'''
+-------------+------------+--------+----------------+
|trip_distance|total_amount|features|      prediction|
+-------------+------------+--------+----------------+
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.3|  [0.01]|9.44834341207817|
|         0.01|         3.8|  [0.01]|9.44834341207817|
|         0.01|         3.8|  [0.01]|9.44834341207817|
|         0.01|         3.8|  [0.01]|9.44834341207817|
|         0.01|         3.8|  [0.01]|9.44834341207817|
|         0.01|         3.8|  [0.01]|9.44834341207817|
|         0.01|         3.8|  [0.01]|9.44834341207817|
+-------------+------------+--------+----------------+
'''


# Evaluate
model.summary.rootMeanSquaredError
'''
3.9391521978481396
'''


model.summary.r2
'''
0.8809194877142736
'''
  • Performance is better than expected. Note that model.summary reports metrics on the training data; a test-set evaluation is sketched below.
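A minimal sketch of scoring the held-out set, assuming the prediction DataFrame computed above:

# Sketch: evaluate on the test-set predictions instead of the training summary.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="total_amount", predictionCol="prediction")
print(evaluator.evaluate(prediction, {evaluator.metricName: "rmse"}))
print(evaluator.evaluate(prediction, {evaluator.metricName: "r2"}))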

Production Service Task

  • Since the model has been built anyway, also do the work needed to use it in a real service.
# Task for using the model in a real service
from pyspark.sql.types import DoubleType
distance_list = [1.1, 5.5, 10.5, 30.0]
distance_df = spark.createDataFrame(distance_list, DoubleType()).toDF("trip_distance")


distance_df.show()
'''
+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+
'''


vdistance_df = vassembler.transform(distance_df)


model.transform(vdistance_df).show()

Improving Performance

  • Add more columns for prediction and preprocess them to raise performance.
query = """
SELECT 
    passenger_count,
    PULocationID as pickup_location_id,
    DOLocationID as dropoff_location_id,
    trip_distance,
    HOUR(tpep_pickup_datetime) as pickup_time,
    DATE_FORMAT(TO_DATE(tpep_pickup_datetime), 'EEEE') AS day_of_week,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 5
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")


data_df.printSchema()
'''
root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)
'''


train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=6)


# Save the split DataFrames.
data_dir = "/Users/6mini/spark/taxi"


# Save in Parquet format:
# good compression, little disk I/O,
# and a column-oriented layout.
train_df.write.format("parquet").save(f"{data_dir}/train/")
test_df.write.format("parquet").save(f"{data_dir}/test/")


# Load back from Parquet
train_df = spark.read.parquet(f"{data_dir}/train/")
test_df = spark.read.parquet(f"{data_dir}/test/")
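One caveat worth noting: with the default save mode, save() throws if the target path already exists, so reruns need an explicit mode. A sketch:

# Sketch: overwrite the previous output on reruns instead of failing.
train_df.write.format("parquet").mode("overwrite").save(f"{data_dir}/train/")
test_df.write.format("parquet").mode("overwrite").save(f"{data_dir}/test/")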

Stage Pipeline

  • Build a pipeline of stages for preprocessing and run it.
# Convert string values to numeric indices and one-hot encode them.
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Configure the categorical features
cat_feats = [
    "pickup_location_id",
    "dropoff_location_id",
    "day_of_week"
]

# List that collects the stages
stages = []


for c in cat_feats:
    cat_indexer = StringIndexer(inputCol=c, outputCol= c + "_idx").setHandleInvalid("keep")
    onehot_encoder = OneHotEncoder(inputCols=[cat_indexer.getOutputCol()], outputCols=[c + "_onehot"])
    stages += [cat_indexer, onehot_encoder]

stages
'''
[StringIndexer_ff1b1c4b32df,
 OneHotEncoder_4ea9b3272ed6,
 StringIndexer_65b49d3966df,
 OneHotEncoder_2e03a29c5802,
 StringIndexer_b6267b01a6ed,
 OneHotEncoder_5de1f0401482]
'''
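# (Sketch) What each indexer/encoder pair does, using day_of_week as an
# example: StringIndexer maps each distinct label to a frequency-ordered
# numeric index (e.g. "Sunday" -> 6.0 in the output further below), and
# OneHotEncoder turns that index into a sparse vector such as (7,[6],[1.0]).
# setHandleInvalid("keep") reserves an extra index so labels unseen during
# fit do not fail at transform time.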


# Numeric preprocessing: VectorAssembler and StandardScaler
from pyspark.ml.feature import VectorAssembler, StandardScaler

num_feats = [
    "passenger_count",
    "trip_distance",
    "pickup_time"
]

for n in num_feats:
    num_assembler = VectorAssembler(inputCols=[n], outputCol= n + "_vecotr")
    num_scaler = StandardScaler(inputCol=num_assembler.getOutputCol(), outputCol= n + "_scaled")
    stages += [num_assembler, num_scaler]

stages
'''
[StringIndexer_ff1b1c4b32df,
 OneHotEncoder_4ea9b3272ed6,
 StringIndexer_65b49d3966df,
 OneHotEncoder_2e03a29c5802,
 StringIndexer_b6267b01a6ed,
 OneHotEncoder_5de1f0401482,
 VectorAssembler_a56b59cc4ac2,
 StandardScaler_898a128ab6df,
 VectorAssembler_ab4e638f04bc,
 StandardScaler_73b16bd04318,
 VectorAssembler_7063855370da,
 StandardScaler_c4e3960fc997]
'''


# Merge the two preprocessing tracks (categorical and numeric) into one.
# A VectorAssembler handles this as well.
assembler_inputs = [c + "_onehot" for c in cat_feats] + [n + "_scaled" for n in num_feats]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages += [assembler]

stages
'''
[StringIndexer_ff1b1c4b32df,
 OneHotEncoder_4ea9b3272ed6,
 StringIndexer_65b49d3966df,
 OneHotEncoder_2e03a29c5802,
 StringIndexer_b6267b01a6ed,
 OneHotEncoder_5de1f0401482,
 VectorAssembler_a56b59cc4ac2,
 StandardScaler_898a128ab6df,
 VectorAssembler_ab4e638f04bc,
 StandardScaler_73b16bd04318,
 VectorAssembler_7063855370da,
 StandardScaler_c4e3960fc997,
 VectorAssembler_37481243c479]
'''


# Build the pipeline from the stages
from pyspark.ml import Pipeline

transform_stages = stages
pipeline = Pipeline(stages=transform_stages)
fitted_transformer = pipeline.fit(train_df)


# Apply it
vtrain_df = fitted_transformer.transform(train_df)

Modeling

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    maxIter=50,
    solver="normal",
    labelCol="total_amount",
    featuresCol="feature_vector"
)


vtrain_df.printSchema()
'''
root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- pickup_location_id_idx: double (nullable = false)
 |-- pickup_location_id_onehot: vector (nullable = true)
 |-- dropoff_location_id_idx: double (nullable = false)
 |-- dropoff_location_id_onehot: vector (nullable = true)
 |-- day_of_week_idx: double (nullable = false)
 |-- day_of_week_onehot: vector (nullable = true)
 |-- passenger_count_vecotr: vector (nullable = true)
 |-- passenger_count_scaled: vector (nullable = true)
 |-- trip_distance_vecotr: vector (nullable = true)
 |-- trip_distance_scaled: vector (nullable = true)
 |-- pickup_time_vecotr: vector (nullable = true)
 |-- pickup_time_scaled: vector (nullable = true)
 |-- feature_vector: vector (nullable = true)
'''


model = lr.fit(vtrain_df)


vtest_df = fitted_transformer.transform(test_df)


predictions = model.transform(vtest_df)


# Cache the result so it is cheap to reuse later.
predictions.cache()


predictions.select(["trip_distance", "day_of_week", "total_amount", "prediction"]).show()
'''
+-------------+-----------+------------+------------------+
|trip_distance|day_of_week|total_amount|        prediction|
+-------------+-----------+------------+------------------+
|         0.01|     Sunday|       22.25|  12.3875521857505|
|          0.1|     Sunday|         3.3|10.744178027113051|
|          6.1|     Monday|        26.3|28.652704157860498|
|          7.1|     Monday|       35.76|31.174097556214498|
|          0.9|     Sunday|         8.3|  7.66950512061783|
|          2.7|     Sunday|        16.6|15.133976597996543|
|          1.0|   Saturday|         7.3| 9.111650395437803|
|          1.7|   Saturday|       14.75|12.508165587852046|
|          0.6|     Friday|         5.8| 9.667477937462271|
|          0.9|     Monday|        11.0|10.668972172418805|
|          1.1|     Sunday|         9.3|11.048364111576014|
|          0.2|     Sunday|         0.3| 7.720810167026536|
|          2.8|     Sunday|        12.3|15.966585384151795|
|          5.4|   Saturday|        24.3|25.982388970567552|
|          1.5|     Friday|       12.95|13.423933951257354|
|          1.3|   Saturday|        14.0| 12.11995461024021|
|          0.8|     Sunday|       12.05|10.062646289454223|
|          2.0|   Saturday|       14.75|14.090220167272687|
|          0.8|     Monday|         8.8| 10.36923224933844|
|          0.3|     Friday|         8.8|  8.95173931183552|
+-------------+-----------+------------+------------------+
'''


model.summary.rootMeanSquaredError
'''
3.877648659368804
'''


model.summary.r2
'''
0.8922402466214665
'''
  • Performance improved slightly.

Hyperparameter Tuning

  • Run hyperparameter tuning to push performance a bit further.
data_dir = "/Users/6mini/spark/taxi"
train_df = spark.read.parquet(f"{data_dir}/train/")
test_df = spark.read.parquet(f"{data_dir}/test/")


# Create a sampled DataFrame for hyperparameter tuning (10% without replacement; cross-validating the full set would be slow)
toy_df = train_df.sample(False, 0.1, seed=6)


from pyspark.ml.feature import OneHotEncoder, StringIndexer

cat_feats = [
    "pickup_location_id",
    "dropoff_location_id",
    "day_of_week"
]

stages = []

for c in cat_feats:
    cat_indexer = StringIndexer(inputCol=c, outputCol= c + "_idx").setHandleInvalid("keep")
    onehot_encoder = OneHotEncoder(inputCols=[cat_indexer.getOutputCol()], outputCols=[c + "_onehot"])
    stages += [cat_indexer, onehot_encoder]


from pyspark.ml.feature import VectorAssembler, StandardScaler

num_feats = [
    "passenger_count",
    "trip_distance",
    "pickup_time"
]

for n in num_feats:
    num_assembler = VectorAssembler(inputCols=[n], outputCol= n + "_vecotr")
    num_scaler = StandardScaler(inputCol=num_assembler.getOutputCol(), outputCol= n + "_scaled")
    stages += [num_assembler, num_scaler]


assembler_inputs = [c + "_onehot" for c in cat_feats] + [n + "_scaled" for n in num_feats]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages += [assembler]


from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(
    maxIter=30,
    solver="normal",
    labelCol='total_amount',
    featuresCol='feature_vector'
)

# Build the CV pipeline
cv_stages = stages + [lr]


cv_pipeline = Pipeline(stages=cv_stages)


param_grid = ParamGridBuilder()\
                .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])\
                .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])\
                .build()


cross_val = CrossValidator(estimator=cv_pipeline,
                           estimatorParamMaps=param_grid,
                           evaluator=RegressionEvaluator(labelCol="total_amount"),
                           numFolds=5)


cv_model = cross_val.fit(toy_df)


alpha = cv_model.bestModel.stages[-1]._java_obj.getElasticNetParam()
reg_param = cv_model.bestModel.stages[-1]._java_obj.getRegParam()
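The _java_obj accessors above reach into the private JVM handle. A sketch of the public Params API that should return the same values (assuming Spark 3.0+, where fitted models expose these params):

# Sketch: read the tuned values through the public Params API.
best_lr = cv_model.bestModel.stages[-1]
alpha = best_lr.getOrDefault(best_lr.elasticNetParam)
reg_param = best_lr.getOrDefault(best_lr.regParam)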

Model Training

  • Train the model with the values obtained from hyperparameter tuning.
transform_stages = stages
pipeline = Pipeline(stages=transform_stages)
fitted_transformer = pipeline.fit(train_df)


vtrain_df = fitted_transformer.transform(train_df)



lr = LinearRegression(
    maxIter=50,
    solver="normal",
    labelCol="total_amount",
    featuresCol="feature_vector",
    elasticNetParam=alpha,
    regParam=reg_param,
)


model = lr.fit(vtrain_df)


vtest_df = fitted_transformer.transform(test_df)


predictions = model.transform(vtest_df)


predictions.cache()


predictions.select(["trip_distance", "day_of_week", "total_amount", "prediction"]).show()


model.summary.rootMeanSquaredError
'''
3.8848282133166245
'''


model.summary.r2
'''
0.8918408379788968
'''
  • Performance actually dropped slightly.

Saving and Loading the Model

model_dir = "/Users/6mini/spark/taxi/model"
model.save(model_dir)


from pyspark.ml.regression import LinearRegressionModel


lr_model = LinearRegressionModel.load(model_dir)


predictions = lr_model.transform(vtest_df)


predictions.show()
'''
+---------------+------------------+-------------------+-------------+-----------+-----------+------------+----------------------+-------------------------+-----------------------+--------------------------+---------------+------------------+----------------------+----------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+
|passenger_count|pickup_location_id|dropoff_location_id|trip_distance|pickup_time|day_of_week|total_amount|pickup_location_id_idx|pickup_location_id_onehot|dropoff_location_id_idx|dropoff_location_id_onehot|day_of_week_idx|day_of_week_onehot|passenger_count_vecotr|passenger_count_scaled|trip_distance_vecotr|trip_distance_scaled|pickup_time_vecotr|  pickup_time_scaled|      feature_vector|        prediction|
+---------------+------------------+-------------------+-------------+-----------+-----------+------------+----------------------+-------------------------+-----------------------+--------------------------+---------------+------------------+----------------------+----------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+
|              0|                10|                 10|         0.01|          8|     Sunday|       22.25|                  86.0|         (253,[86],[1.0])|                   97.0|          (259,[97],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|              [0.01]|[0.00289700388219...|             [8.0]| [1.767444038036644]|(522,[86,350,518,...| 12.40062399741981|
|              0|                10|                215|          0.1|         11|     Sunday|         3.3|                  86.0|         (253,[86],[1.0])|                  154.0|         (259,[154],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [0.1]|[0.02897003882196...|            [11.0]|[2.4302355523003856]|(522,[86,407,518,...|11.179916642407761|
|              0|                13|                233|          6.1|         13|     Monday|        26.3|                  49.0|         (253,[49],[1.0])|                   24.0|          (259,[24],[1.0])|            5.0|     (7,[5],[1.0])|                 [0.0]|                 [0.0]|               [6.1]|[1.7671723681399851]|            [13.0]|[2.8720965618095464]|(522,[49,277,517,...|28.513128986649363|
|              0|                41|                 68|          7.1|         17|     Monday|       35.76|                  39.0|         (253,[39],[1.0])|                   15.0|          (259,[15],[1.0])|            5.0|     (7,[5],[1.0])|                 [0.0]|                 [0.0]|               [7.1]|[2.0568727563596547]|            [17.0]|[3.7558185808278686]|(522,[39,268,517,...|30.882927780540072|
|              0|                41|                 74|          0.9|         17|     Sunday|         8.3|                  39.0|         (253,[39],[1.0])|                   29.0|          (259,[29],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [0.9]|[0.26073034939770...|            [17.0]|[3.7558185808278686]|(522,[39,282,518,...| 7.868949092930335|
|              0|                41|                143|          2.7|          8|     Sunday|        16.6|                  39.0|         (253,[39],[1.0])|                   20.0|          (259,[20],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [2.7]|[0.7821910481931083]|             [8.0]| [1.767444038036644]|(522,[39,273,518,...|15.094931340419489|
|              0|                41|                151|          1.0|         16|   Saturday|         7.3|                  39.0|         (253,[39],[1.0])|                   28.0|          (259,[28],[1.0])|            3.0|     (7,[3],[1.0])|                 [0.0]|                 [0.0]|               [1.0]|[0.2897003882196697]|            [16.0]| [3.534888076073288]|(522,[39,281,515,...| 9.297192995371958|
|              0|                41|                239|          1.7|         19|   Saturday|       14.75|                  39.0|         (253,[39],[1.0])|                    3.0|           (259,[3],[1.0])|            3.0|     (7,[3],[1.0])|                 [0.0]|                 [0.0]|               [1.7]|[0.4924906599734385]|            [19.0]|  [4.19767959033703]|(522,[39,256,515,...|12.505205755067445|
|              0|                43|                 24|          0.6|         18|     Friday|         5.8|                  20.0|         (253,[20],[1.0])|                   46.0|          (259,[46],[1.0])|            0.0|     (7,[0],[1.0])|                 [0.0]|                 [0.0]|               [0.6]|[0.17382023293180...|            [18.0]| [3.976749085582449]|(522,[20,299,512,...| 9.807590972898764|
|              0|                43|                142|          0.9|          8|     Monday|        11.0|                  20.0|         (253,[20],[1.0])|                    7.0|           (259,[7],[1.0])|            5.0|     (7,[5],[1.0])|                 [0.0]|                 [0.0]|               [0.9]|[0.26073034939770...|             [8.0]| [1.767444038036644]|(522,[20,260,517,...|10.651270340211198|
|              0|                43|                142|          1.1|         12|     Sunday|         9.3|                  20.0|         (253,[20],[1.0])|                    7.0|           (259,[7],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [1.1]|[0.3186704270416367]|            [12.0]| [2.651166057054966]|(522,[20,260,518,...|11.014278848833877|
|              0|                43|                151|          0.2|         21|     Sunday|         0.3|                  20.0|         (253,[20],[1.0])|                   28.0|          (259,[28],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [0.2]|[0.05794007764393...|            [21.0]| [4.639540599846191]|(522,[20,281,518,...| 7.862427690725334|
|              0|                43|                166|          2.8|         12|     Sunday|        12.3|                  20.0|         (253,[20],[1.0])|                   37.0|          (259,[37],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [2.8]|[0.8111610870150752]|            [12.0]| [2.651166057054966]|(522,[20,290,518,...|16.034126485835703|
|              0|                43|                231|          5.4|         13|   Saturday|        24.3|                  20.0|         (253,[20],[1.0])|                   34.0|          (259,[34],[1.0])|            3.0|     (7,[3],[1.0])|                 [0.0]|                 [0.0]|               [5.4]|[1.5643820963862165]|            [13.0]|[2.8720965618095464]|(522,[20,287,515,...|25.834845317536548|
|              0|                43|                238|          1.5|         17|     Friday|       12.95|                  20.0|         (253,[20],[1.0])|                    4.0|           (259,[4],[1.0])|            0.0|     (7,[0],[1.0])|                 [0.0]|                 [0.0]|               [1.5]|[0.43455058232950...|            [17.0]|[3.7558185808278686]|(522,[20,257,512,...|13.318822345381877|
|              0|                43|                262|          1.3|         13|   Saturday|        14.0|                  20.0|         (253,[20],[1.0])|                   17.0|          (259,[17],[1.0])|            3.0|     (7,[3],[1.0])|                 [0.0]|                 [0.0]|               [1.3]|[0.3766105046855706]|            [13.0]|[2.8720965618095464]|(522,[20,270,515,...|12.070487123616296|
|              0|                43|                263|          0.8|         16|     Sunday|       12.05|                  20.0|         (253,[20],[1.0])|                    9.0|           (259,[9],[1.0])|            6.0|     (7,[6],[1.0])|                 [0.0]|                 [0.0]|               [0.8]|[0.23176031057573...|            [16.0]| [3.534888076073288]|(522,[20,262,518,...|10.115757256158687|
|              0|                45|                249|          2.0|         11|   Saturday|       14.75|                  62.0|         (253,[62],[1.0])|                   27.0|          (259,[27],[1.0])|            3.0|     (7,[3],[1.0])|                 [0.0]|                 [0.0]|               [2.0]|[0.5794007764393394]|            [11.0]|[2.4302355523003856]|(522,[62,280,515,...|14.111216356455753|
|              0|                48|                 43|          0.8|         16|     Monday|         8.8|                  12.0|         (253,[12],[1.0])|                   26.0|          (259,[26],[1.0])|            5.0|     (7,[5],[1.0])|                 [0.0]|                 [0.0]|               [0.8]|[0.23176031057573...|            [16.0]| [3.534888076073288]|(522,[12,279,517,...|10.439980432510467|
|              0|                48|                 50|          0.3|         17|     Friday|         8.8|                  12.0|         (253,[12],[1.0])|                   36.0|          (259,[36],[1.0])|            0.0|     (7,[0],[1.0])|                 [0.0]|                 [0.0]|               [0.3]|[0.08691011646590...|            [17.0]|[3.7558185808278686]|(522,[12,289,512,...| 8.993055728266917|
+---------------+------------------+-------------------+-------------+-----------+-----------+------------+----------------------+-------------------------+-----------------------+--------------------------+---------------+------------------+----------------------+----------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+
'''
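One caveat for serving: only the regression model was saved here, but raw requests still have to pass through the fitted preprocessing pipeline. A minimal sketch, assuming the fitted_transformer from above (pipeline_dir is a hypothetical path for illustration):

# Sketch: persist the fitted preprocessing pipeline alongside the model.
from pyspark.ml import PipelineModel

pipeline_dir = "/Users/6mini/spark/taxi/pipeline"  # hypothetical path
fitted_transformer.save(pipeline_dir)

# At serving time: load both, transform the raw rows, then predict.
loaded_pipeline = PipelineModel.load(pipeline_dir)
loaded_model = LinearRegressionModel.load(model_dir)
result = loaded_model.transform(loaded_pipeline.transform(test_df))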