forked from andrewgrahampowell/f1-data-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
f1-analysis-sql-script.sql
308 lines (269 loc) · 7.92 KB
/
f1-analysis-sql-script.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
-- Analysing Formula 1 data 1950-2020 using dataset from Kaggle
-- GENERAL QUERIES
SELECT
*
FROM
results;
-- Find driver who's competed in the most races (Alonso)
SELECT
results.driverId,
COUNT(results.RaceId),
drivers.surname,
drivers.forename
FROM
results
JOIN
drivers ON results.driverId = drivers.driverId
GROUP BY driverId , drivers.surname, drivers.forename
ORDER BY COUNT(RaceId) DESC;
-- Investigating Alonso's career
SELECT
RaceID, driverID,
fastestLapTime,
fastestLapSpeed
FROM
results
WHERE
driverId = '4' AND fastestLapSpeed <> 0
ORDER BY fastestLapTime ASC;
SELECT
results.RaceID,
driverID,
fastestLapTime,
fastestLapSpeed,
races.circuitId,
races.date,
circuits.name
FROM
results
JOIN
races ON results.RaceId = races.RaceID
LEFT JOIN
circuits ON races.circuitId = circuits.circuitId
WHERE
driverId = '4' AND fastestLapSpeed <> 0
ORDER BY fastestLapTime ASC;
-- See driver stats ordered by Fastest Lap Time
SELECT
results.RaceID,
results.driverID,
drivers.forename,
drivers.surname,
fastestLapTime,
fastestLapSpeed,
races.circuitId,
races.date,
circuits.name
FROM
results
JOIN
races ON results.RaceId = races.RaceID
JOIN
drivers ON results.driverId = drivers.driverId
LEFT JOIN
circuits ON races.circuitId = circuits.circuitId
WHERE
fastestLapSpeed <> 0
ORDER BY fastestLapTime ASC;
-- This query shows us some unusually fast lap times for Bahrain 2020. Let's investigate further.
-- What are the fastest lap times for Bahrain?
SELECT
results.RaceID,
fastestLapTime,
races.circuitId,
races.date,
circuits.name
FROM
results
JOIN
races ON results.RaceId = races.RaceID
LEFT JOIN
circuits ON races.circuitId = circuits.circuitId
WHERE
fastestLapSpeed <> 0 AND circuits.name = 'Bahrain International Circuit'
ORDER BY fastestLapTime ASC;
-- The results show that after RaceID 1046, the next fastest times are over 30s slower, meaning we have an anomaly.
-- Further investigation shows this race was a one off, reformatted race due to COVID.
-- Therefore, it shall be discounted from this query.
-- See driver stats ordered by Fastest Lap Time excluding RaceID 1046
-- Fastest recorded lap up to 2020 is by Carlos Sainz at Red Bull Ring
SELECT
results.RaceID,
results.driverID,
drivers.forename,
drivers.surname,
fastestLapTime,
fastestLapSpeed,
races.circuitId,
races.date,
circuits.name
FROM
results
JOIN
races ON results.RaceId = races.RaceID
JOIN
drivers ON results.driverId = drivers.driverId
LEFT JOIN
circuits ON races.circuitId = circuits.circuitId
WHERE
fastestLapSpeed <> 0 AND results.RaceID != '1046'
ORDER BY fastestLapTime ASC;
-- Fastest Lap by Track
SELECT circuits.name AS track,
MIN(results.fastestLapTime) AS fastest_lap
FROM
results
JOIN
races ON results.RaceId = races.RaceID
JOIN
drivers ON results.driverId = drivers.driverId
LEFT JOIN
circuits ON races.circuitId = circuits.circuitId
WHERE results.fastestLapTime AND circuits.name IS NOT NULL
GROUP BY circuits.name
ORDER BY fastest_lap ASC;
-- DRIVER PERFORMANCE ANALYSIS
-- Find career points leader (Hamilton)
SELECT
results.driverId,
SUM(results.points) AS total_points,
drivers.forename,
drivers.surname
FROM
results
JOIN
drivers ON results.driverId = drivers.driverId
GROUP BY results.driverId , drivers.forename , drivers.surname
ORDER BY total_points DESC;
-- Find highest avg points per race (Hamilton)
SELECT
results.driverId,
AVG(results.points) AS average_points,
COUNT(results.RaceId) AS total_races,
drivers.surname,
drivers.forename
FROM
results
JOIN
drivers ON results.driverId = drivers.driverId
GROUP BY results.driverId , drivers.surname, drivers.forename
ORDER BY average_points DESC;
-- We can a see a bias towards modern drivers, which we know to be true as the points system has changed many times
-- Find the driver with the highest average career finish (min 20 races) (Ascari)
SELECT
r.driverId,
AVG(NULLIF(CAST(r.position AS SIGNED), 0)) AS average_position,
COUNT(r.RaceId) AS total_races,
d.surname,
d.forename
FROM results r
JOIN drivers d ON r.driverId = d.driverId
GROUP BY r.driverId, d.surname, d.forename
HAVING total_races > 20
ORDER BY average_position ASC
LIMIT 20;
-- Finding the driver with the lowest average career finish (min 20 races) (Mazepin)
SELECT
r.driverId,
AVG(NULLIF(CAST(r.position AS SIGNED), 0)) AS average_position,
COUNT(r.RaceId) AS total_races,
d.surname,
d.forename
FROM results r
JOIN drivers d ON r.driverId = d.driverId
GROUP BY r.driverId, d.surname, d.forename
HAVING total_races > 20
ORDER BY average_position DESC;
-- Dividing drivers into quartiles based on avg career points achieved (min 20 points)
SELECT
driverId,
surname,
forename,
total_points,
experience_level,
NTILE(4) OVER (ORDER BY total_points DESC) AS performance_quartile,
AVG(total_points) OVER () AS ovr_avg_career_points
FROM (
SELECT
r.driverId,
d.surname,
d.forename,
SUM(r.points) AS total_points,
CASE
WHEN COUNT(r.RaceId) <= 50 THEN '1-50 races'
WHEN COUNT(r.RaceId) <= 100 THEN '51-100 races'
WHEN COUNT(r.RaceId) <= 150 THEN '101-150 races'
ELSE '151+ races'
END as experience_level
FROM results r
JOIN drivers d ON r.driverId = d.driverId
GROUP BY r.driverId, d.surname, d.forename
HAVING SUM(r.points) > 20
) AS QuartileData
ORDER BY total_points DESC;
-- Dividing drivers into quartiles based on avg career finish position (min 20 races)
SELECT
driverId,
surname,
forename,
average_position,
total_races,
experience_level,
NTILE(4) OVER (PARTITION BY experience_level ORDER BY average_position) AS performance_quartile,
AVG(average_position) OVER (PARTITION BY experience_level) AS ovr_avg_career_finish_position
FROM (
SELECT
r.driverId,
d.surname,
d.forename,
AVG(NULLIF(CAST(r.position AS SIGNED), 0)) AS average_position,
COUNT(r.RaceId) AS total_races,
CASE
WHEN COUNT(r.RaceId) <= 50 THEN '1-50 races'
WHEN COUNT(r.RaceId) <= 100 THEN '51-100 races'
WHEN COUNT(r.RaceId) <= 150 THEN '101-150 races'
ELSE '151+ races'
END as experience_level
FROM results r
JOIN drivers d ON r.driverId = d.driverId
WHERE r.position <> '0'
GROUP BY r.driverId, d.surname, d.forename
HAVING COUNT(r.RaceId) > 20
) AS QuartileData
ORDER BY experience_level, average_position ASC;
-- CONSTRUCTOR ANALYSIS
-- Finding the constructor with the highest average finish (min 20 races)
SELECT
results.constructorId,
AVG(NULLIF(CAST(results.position AS SIGNED), 0)) AS average_position,
COUNT(constructor_results.constructorResultsId) AS total_races
FROM
results
JOIN
constructor_results ON results.constructorId = constructor_results.constructorId
WHERE
results.position NOT IN ('DNF', 'DSQ') -- Exclude non-numeric values
GROUP BY
results.constructorId
HAVING
COUNT(results.RaceId) > 20
ORDER BY
average_position ASC;
-- Finding the constructor with the lowest average finish (min 20 races)
SELECT
results.constructorId,
AVG(NULLIF(CAST(results.position AS SIGNED), 0)) AS average_position,
COUNT(constructor_results.constructorResultsId) AS total_races
FROM
results
JOIN
constructor_results ON results.constructorId = constructor_results.constructorId
WHERE
results.position NOT IN ('DNF', 'DSQ') -- Exclude non-numeric values
GROUP BY
results.constructorId
HAVING
COUNT(results.RaceId) > 20
ORDER BY
average_position DESC;
-- Who is the greatest driver of all time?