2017中国城市空气质量指数(aqi)爬取及分析

引言

2016是我来北京第一年,这年年末和2017年的年初,雾霾在这个城市肆虐了一个冬天,还记得那时候每天出门都要带着口罩,卡着眼镜戴的特别难受。在公司里大家也时不时的讨论着空气质量、哪款口罩质量比较好、中午要不要出去吃饭等等与雾霾有关的话题。本科四年在哈尔滨,那时候冬天的雾虽然存在,但也没有让人感到窒息,研究生在武汉的三年也未见满大街都带着口罩的壮观景象。所以当时的想法是,连基本的生存都要成问题,还谈何工作、生活?既然无法改变空气质量,那可以做的就是选择一个空气质量说得过去的城市。于是选择了十个城市,在空气质量网站上爬了2017年连续一年的空气质量,分别是北京、上海、杭州、珠海、深圳、广州、成都、苏州、厦门、哈尔滨。这些城市主要是以经济发展空间这个维度来选择的,个人认为也是我们这代人比较有倾向性的久居城市(哈尔滨是hometown,纯属关心;武汉呆了几年,觉得以后再也不会去了)。
爬虫代码在服务器上crontab每五分钟一次定时执行,最终结果最小粒度为小时,个别时间因为不明原因数据缺失。

爬虫

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/python
#-*-coding:utf-8 -*-
# crawl.py
import urllib
import re
def getHtml(url):
page = urllib.urlopen(url)
html = page.read()
return html
def getImg(html):
reg = r'src="(.+?\.png)" class'
reg_time = r'<h4>更新时间 (.+?)</h4>'
imgre = re.compile(reg)
titlere = re.compile(reg_time)
imgurl = re.findall(imgre,html)[0]
title = re.findall(titlere, html)[0][0:13]
urllib.urlretrieve(imgurl,'./aqi/%s.png' %title)
def getaqi(html, file):
reg = r'aqi-bg aqi-level-(.+?)\">(.+?)</span>'
reg_time = r'label label-info\">(.+?)发布</span>'
aqire = re.compile(reg)
timere = re.compile(reg_time)
aqiurl = re.findall(aqire,html)[0][1]
index = aqiurl.split(" ")[0]
quality = aqiurl.split(" ")[1]
time = re.findall(timere, html)[0].replace("年","-").replace("月","-").replace("日","")
with open(file, 'r') as f:
lines = f.readlines()
last_line = lines[-1]
if (last_line[0:16] != time):
with open(file, "a") as f:
f.write('\n' + time + '\t' + index + '\t' + quality)
if __name__ == '__main__':
#get img
htmlimg = getHtml("http://www.air-level.com/")
getImg(htmlimg)
#get aqi
cityarr = ("shanghai","beijing","hangzhou","zhuhai","shenzhen","guangzhou","chengdu","haerbin","suzhou","xiamen")
for i in cityarr:
htmlaqi = getHtml("http://www.air-level.com/air/%s/" %i)
getaqi(htmlaqi, "./aqi/%s" %i)

分析

1
2
3
4
5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
1
city_lst = ["beijing", "chengdu", "guangzhou", "haerbin", "hangzhou", "shanghai", "shenzhen", "suzhou", "xiamen", "zhuhai"]
1
2
3
4
5
6
7
8
9
10
11
12
13
df_lst = []
for city in city_lst:
tmp = pd.read_table(city, names = ["tm", "aqi_" + city, "lv_" + city])
tmp["tm"] = pd.to_datetime(tmp["tm"].apply(lambda x: x + ":00"))
df_lst.append(tmp)
timeIndex = pd.date_range("2017-01-07 00:00", "2018-01-06 23:00", freq="H")
timeIndex = timeIndex.to_series().to_frame().reset_index(drop = True)
timeIndex.columns = ["tm"]
df = pd.merge(timeIndex, df_lst[0], how = 'left', on = 'tm')
for i in range(1, len(city_lst)):
df = pd.merge(df, df_lst[i], how = 'left', on = 'tm')
1
2
3
4
5
6
7
8
# hour to day (daily mean)
def day_scale(df):
df_scale = pd.DataFrame()
df["date"] = df.tm.dt.date
df_scale["date"] = df["date"].unique()
for col in [col for col in df.columns.tolist() if "aqi" in col]:
df_scale[col] = df.groupby(["date"])[col].mean().reset_index()[col]
return df_scale
1
2
3
4
5
6
7
8
# hour to month (monthly mean)
def month_scale(df):
df_scale = pd.DataFrame()
df["month"] = df.index.map(lambda x: str(int(x / df.shape[0] * 12) + 1))
df_scale["month"] = df["month"].unique()
for col in [col for col in df.columns.tolist() if "aqi" in col]:
df_scale[col] = df.groupby(["month"])[col].mean().reset_index()[col]
return df_scale
1
2
3
4
5
# hour scale
plt.figure(figsize=[20,10])
for city in city_lst:
plt.plot(df["tm"], df["aqi_" + city])
plt.legend(prop={'size':16})
1
2
3
4
5
# day scale
plt.figure(figsize=[20,10])
for city in city_lst:
plt.plot(day_scale(df)["date"], day_scale(df)["aqi_" + city])
plt.legend(prop={'size':16})
1
2
3
4
5
# month scale
plt.figure(figsize=[20,10])
for city in city_lst:
plt.plot(month_scale(df)["month"], month_scale(df)["aqi_" + city])
plt.legend(prop={'size':16})
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Moving Average
df_day = day_scale(df)
df_day.fillna(df_day.median(axis = 0), inplace=True) #should be no none value when using moving average method
plt.figure(figsize=[20,10])
plt.title("30 days moving average", fontsize = 18)
for city in city_lst:
plt.plot(df_day["date"], df_day["aqi_" + city].to_frame().rolling(30).mean())
plt.legend(city_lst, prop = {'size':16})
plt.figure(figsize=[20,10])
plt.title("60 days moving average", fontsize = 18)
for city in city_lst:
plt.plot(df_day["date"], df_day["aqi_" + city].to_frame().rolling(60).mean())
plt.legend(city_lst, prop = {'size':16})
1
2
3
4
plt.figure(figsize=[20,10])
sns.boxplot(df[[col for col in df.columns if "aqi" in col]])
plt.ylim([0,200])
plt.xticks(fontsize = 16)
1
2
lvIndex = pd.DataFrame(df.lv_beijing.dropna().unique()).reset_index(drop = True)
lvIndex.columns = ["lvIndex"]
1
2
3
4
5
6
7
8
df_lv = df["lv_" + city_lst[0]].value_counts().reset_index()
for i in range(1, len(city_lst)):
df_lv = pd.merge(df_lv, df["lv_" + city_lst[i]].value_counts().reset_index(), how = 'left', on = 'index')
df_lv = df_lv.fillna(0)
columns = df_lv["index"].map({"优":"lv_0", "良":"lv_1", "轻度污染":"lv_2", "中度污染":"lv_3", "重度污染":"lv_4", "严重污染":"lv_5", "极度污染":"lv_6"})
df_lv = df_lv.transpose().drop(["index"])
df_lv.columns = columns
df_lv.plot(kind = 'bar', stacked = True, figsize = [20,10], fontsize = 16)

结论:

  1. 滑动平均结果显示,夏季空气质量优于冬季
  2. 箱型图显示,南方沿海城市空气质量优于北方,且异常值很少且不会很异常,即,再差也差不到哪去
  3. 北京2017年冬季空气质量有明显好转,与相关政策干预河北省有关
  4. 哈尔滨和北京半斤八两,夏季哈尔滨是赢家,冬季北京是赢家。极度恶劣空气状况频率哈尔滨是赢家
  5. 在这个政策主导和经纬度跨度极大的国家,空气质量受到很多因素的影响,过去一年的统计结果未来未必依然具有说服力

(待更新)
全年中国空气质量分布图的动态展示