- Count the number of requests created on March 13, 2017.
- Specify the upper bound by adding 1 to the lower bound. (To specify the upper bound add 1 to the lower bound)
-- Count requests created on March 13, 2017
SELECT count(*)
FROM evanston311
WHERE date_created >= '2017-03-13'
AND date_created < '2017-03-13'::date + 1;
Want to master SQL ? go through this interview problems:
-- Add 100 days to the current timestamp
SELECT now()+'100 days'::interval;
-- Select the current timestamp,
-- and the current timestamp + 5 minutes
SELECT now(), now()+ '5 minutes':: interval;
-- Select the category and the average completion time by category
SELECT category,
avg(date_completed - date_created) AS completion_time
FROM evanston311
Group By category
-- Order the results
Order by completion_time desc;
date_part and finding monthly and yearly counts
-- Extract the month from date_created and count requests
SELECT date_part('month', date_created) AS month,
count(*)
FROM evanston311
-- Limit the date range
WHERE date_created >= '2016-01-01'
AND date_created < '2017-12-31'
-- Group by what to get monthly counts?
GROUP BY month;
date_part and counts hourly.
-- Get the hour and count requests
SELECT date_part('hour', date_created) AS hour,
count(*)
FROM evanston311
GROUP BY hour
-- Order results to select most common
ORDER BY hour
LIMIT 1;
Does the time required to complete a request vary by the day of the week on which the request was created?
We can get the name of the day of the week by converting a timestamp to character data:
to_char(date_created, 'day')
But character names for the days of the week sort in alphabetical,
not chronological, order. To get the chronological order of days of the
week with an integer value for each day, we can use:
EXTRACT(DOW FROM date_created)
DOW
stands for "day of week."
-- Select name of the day of the week the request was created
SELECT to_char(date_created, 'day') AS day,
-- Select avg time between request creation and completion
avg(date_completed - date_created) AS duration
FROM evanston311
-- Group by the name of the day of the week and
-- integer value of day of week the request was created
GROUP BY day, EXTRACT(dow from date_created)
-- Order by integer value of the day of the week
-- the request was created
ORDER BY EXTRACT(dow from date_created);
Date truncation
Unlike date_part()
or EXTRACT()
, date_trunc()
keeps date/time units larger than the field you specify as part of the
date. So instead of just extracting one component of a timestamp, date_trunc()
returns the specified unit and all larger ones as well.
Recall the syntax:
date_trunc('field', timestamp)
Using date_trunc()
, find the average number of Evanston
311 requests created per day for each month of the data. Ignore days
with no requests when taking the average.
-- Aggregate daily counts by month
SELECT date_trunc('month', day) AS month,
avg(count)
-- Subquery to compute daily counts
FROM (SELECT date_trunc('day', date_created) AS day,
count(*) AS count
FROM evanston311
GROUP BY day) AS daily_count
GROUP BY month
ORDER BY month;
Aggregating series
WITH hour_series AS (
SELECT generate_series ( '2018-04-23 09:00:00, -- 9pm
'2018-04-23 14:00:00, --2pm
'1 hour'::interval)AS hours) -- I have created a column with hours
--Hours from series, count date (not *) to count non-null value.
SELECT hours, count(date)
--join series to sales data
from hour_series
left join sales
on hours= date_trunc('hour',date)
-- I AM JOINING THE TABLES BECAUSE I WANT TO GET THE HOURS AND DATES BOTH AND SEE THE TOTAL NUMBER OF APPLICATION SENT ON THE DAY.
Group by hours
Order by hours;
SELECT day
-- 1) Subquery to generate all dates
-- from min to max date_created
FROM (SELECT generate_series(min(date_created),
max(date_created),
'1 day')::date AS day
-- What table is date_created in?
FROM evanston311) AS all_dates
-- 4) Select dates (day from above) that are NOT IN the subquery
WHERE day Not IN
-- 2) Subquery to select all date_created values as dates
(SELECT date_created::date
FROM evanston311);
logic:
--show me dates that
are in my created table but not in the original table.
Find the median number of Evanston 311 requests per day in each six
month period from 2016-01-01 to 2018-06-30. Build the query following
the three steps below.
-- Bins from Step 1
WITH bins AS (
SELECT generate_series('2016-01-01',
'2018-01-01',
'6 months'::interval) AS lower,
generate_series('2016-07-01',
'2018-07-01',
'6 months'::interval) AS upper),
-- Daily counts from Step 2
daily_counts AS (
SELECT day, count(date_created) AS count
FROM (SELECT generate_series('2016-01-01',
'2018-06-30',
'1 day'::interval)::date AS day) AS daily_series
LEFT JOIN evanston311
ON day = date_created::date
GROUP BY day)
-- Select bin bounds
SELECT lower,
upper,
-- Compute median of count for each bin
percentile_disc(0.5) WITHIN GROUP (ORDER BY count) AS median
-- Join bins and daily_counts
FROM bins
LEFT JOIN daily_counts
-- Where the day is between the bin bounds
ON day >= lower
AND day < upper
-- Group by bin bounds
GROUP BY lower, upper
ORDER BY lower;
results table---
Monthly average with missing dates
Find the average number of Evanston 311 requests created per day for each month of the data.
This time, do not ignore dates with no requests.
Instructions
- Generate a series of dates from 2016-01-01 to 2018-06-30.
- Join the series to a subquery to count the number of requests created per day.
- Use
date_trunc()
to get months from date
, which has all dates, NOT day
. Always pick the name of the column. I made mistakes of picking the table name which is all_days.
- Use
coalesce()
to replace NULL count
values with 0. Compute the average of this value. Take the count to run the operation on.
-- generate series with all days from 2016-01-01 to 2018-06-30
WITH all_days AS
(SELECT generate_series('2016-01-01',
'2018-06-30',
'1 day'::interval) AS date),
-- Subquery to compute daily counts
daily_count AS
(SELECT date_trunc('day', date_created) AS day,
count(*) AS count
FROM evanston311
GROUP BY day)
-- Aggregate daily counts by month using date_trunc
SELECT date_trunc('month', date) AS month,
-- Use coalesce to replace NULL count values with 0
avg(coalesce(count, 0)) AS average
FROM all_days
LEFT JOIN daily_count
-- Joining condition
ON all_days.date=daily_count.day
GROUP BY month
ORDER BY month;
Longest gap
What is the longest time between Evanston 311 requests being submitted?
Recall the syntax for lead()
and lag()
:
lag(column_to_adjust) OVER (ORDER BY ordering_column)
lead(column_to_adjust) OVER (ORDER BY ordering_column)
- Select
date_created
and the date_created
of the previous request using lead()
or lag()
as appropriate. - Compute the gap between each request and the previous request.
- Select the row with the maximum gap.
-- Compute the gaps
WITH request_gaps AS (
SELECT date_created,
-- lead or lag
lag(date_created) OVER (order by date_created) AS previous,
-- compute gap as date_created minus lead or lag
date_created - lag(date_created) OVER (order by date_created) AS gap
FROM evanston311)
-- Select the row with the maximum gap
SELECT *
FROM request_gaps
-- Subquery to select maximum gap from request_gaps
WHERE gap = (SELECT max(gap)
FROM request_gaps);
Rats!
Requests in category "Rodents- Rats" average over 64 days to resolve. Why?
Investigate in 4 steps:
Why is the average so high? Check the distribution of completion times. Hint: date_trunc()
can be used on intervals.
See how excluding outliers influences average completion times.
Do requests made in busy months take longer to complete? Check the correlation between the average completion time and requests per month.
Compare the number of requests created per month to the number completed.
Remember: the time to resolve, or completion time, is date_completed - date_created
.
SELECT category,
-- Compute average completion time per category
avg(date_completed - date_created) AS avg_completion_time
FROM evanston311
-- Where completion time is less than the 95th percentile value
WHERE date_completed - date_created <
-- Compute the 95th percentile of completion time in a subquery
(SELECT percentile_disc(0.95) WITHIN GROUP (ORDER BY date_completed - date_created)
FROM evanston311)
GROUP BY category
-- Order the results
ORDER BY avg_completion_time DESC;
- Get
corr()
between avg. completion time and monthly requests. EXTRACT(epoch FROM interval)
returns seconds in interval
.
-- Compute correlation (corr) between
-- avg_completion time and count from the subquery
SELECT corr(avg_completion, count)
-- Convert date_created to its month with date_trunc
FROM (SELECT date_trunc('month', date_created) AS month,
-- Compute average completion time in number of seconds
avg(EXTRACT(epoch FROM date_completed - date_created)) AS avg_completion,
-- Count requests per month
count(*) AS count
FROM evanston311
-- Limit to rodents
WHERE category='Rodents- Rats'
-- Group by month, created above
GROUP BY month)
-- Required alias for subquery
AS monthly_avgs;
- Select the number of requests created and number of requests completed per month.
-- Compute monthly counts of requests created
WITH created AS (
SELECT date_trunc('month', date_created) AS month,
count(*) AS created_count
FROM evanston311
WHERE category='Rodents- Rats'
GROUP BY month),
-- Compute monthly counts of requests completed
completed AS (
SELECT date_trunc('month', date_completed) AS month,
count(*) AS completed_count
FROM evanston311
WHERE category='Rodents- Rats'
GROUP BY month)
-- Join monthly created and completed counts
SELECT created.month,
created_count,
completed_count
FROM created
INNER JOIN completed
ON created.month=completed.month
ORDER BY created.month;
This sequel ends here... see you in the next round.
No comments:
Post a Comment