@Robin_TY (instructor): Chapter 9, Case 8, "Air Quality Index calculation v7.0": fetching AQI for all cities only gets through the first dozen or so and then throws many errors (the instructor's own source code won't run properly either)

[code=Python]import requests
from bs4 import BeautifulSoup


def get_city_aqi(city_pinyin):
    """
    Fetch the AQI readings for a single city.
    """
    url = 'http://pm25.in/' + city_pinyin
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    city_aqi = []
    for i in range(8):
        div_content = div_list[i]
        caption = div_content.find('div', {'class': 'caption'}).text.strip()
        value = div_content.find('div', {'class': 'value'}).text.strip()

        city_aqi.append((caption, value))
    return city_aqi


def get_all_cities():
    """
    Fetch the (name, pinyin) pairs for all cities.
    """
    url = 'http://pm25.in/'
    city_list = []
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')

    city_div = soup.find_all('div', {'class': 'bottom'})[1]
    city_link_list = city_div.find_all('a')
    for city_link in city_link_list:
        city_name = city_link.text
        city_pinyin = city_link['href'][1:]
        city_list.append((city_name, city_pinyin))
    return city_list


def main():
    """
    Entry point: list all cities, then fetch and print each city's AQI.
    """
    city_list = get_all_cities()
    for city in city_list:
        city_name = city[0]
        city_pinyin = city[1]
        city_aqi = get_city_aqi(city_pinyin)
        print(city_name, city_aqi)


if __name__ == '__main__':
    main()[/code]

[code=text]Traceback (most recent call last):
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 360, in _error_catcher
    yield
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 669, in read_chunked
    chunk = self._handle_chunk(amt)
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 624, in _handle_chunk
    returned_chunk = self._fp._safe_read(self.chunk_left)
  File "D:\Anaconda3\lib\http\client.py", line 620, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "D:\Anaconda3\lib\socket.py", line 589, in readinto
    return self._sock.recv_into(b)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Anaconda3\lib\site-packages\requests\models.py", line 750, in generate
    for chunk in self.raw.stream(chunk_size, decode_content=True):
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 490, in stream
    for line in self.read_chunked(amt, decode_content=decode_content):
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 694, in read_chunked
    self._original_response.close()
  File "D:\Anaconda3\lib\contextlib.py", line 130, in __exit__
    self.gen.throw(type, value, traceback)
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 365, in _error_catcher
    raise ReadTimeoutError(self._pool, None, 'Read timed out.')
urllib3.exceptions.ReadTimeoutError: HTTPConnectionPool(host='pm25.in', port=80): Read timed out.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/DevProjects/python/python_course/lect08/aqi_v7.0.py", line 61, in
    main()
  File "D:/DevProjects/python/python_course/lect08/aqi_v7.0.py", line 56, in main
    city_aqi = get_city_aqi(city_pinyin)
  File "D:/DevProjects/python/python_course/lect08/aqi_v7.0.py", line 16, in get_city_aqi
    r = requests.get(url, timeout=30)
  File "D:\Anaconda3\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "D:\Anaconda3\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "D:\Anaconda3\lib\site-packages\requests\sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\Anaconda3\lib\site-packages\requests\sessions.py", line 686, in send
    r.content
  File "D:\Anaconda3\lib\site-packages\requests\models.py", line 828, in content
    self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
  File "D:\Anaconda3\lib\site-packages\requests\models.py", line 757, in generate
    raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='pm25.in', port=80): Read timed out.

Process finished with exit code 1
[/code]
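Not part of the original thread, but since the traceback above is a plain read timeout from pm25.in, a common workaround is to retry with backoff, throttle the requests, and skip cities that still fail instead of crashing. A minimal sketch (the helper name get_city_aqi_safe, the retry counts, and the sleep interval are my own choices, not from the course code):

[code=Python]import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared session: connection pooling plus automatic retries with
# exponential backoff between attempts.
session = requests.Session()
retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))


def get_city_aqi_safe(city_pinyin):
    """Fetch one city's page without letting a slow response kill the run."""
    url = 'http://pm25.in/' + city_pinyin
    try:
        r = session.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print('skipping', city_pinyin, '->', e)
        return None
    time.sleep(0.5)  # throttle: fetching 300+ cities back to back invites timeouts
    # ...parse r.text with BeautifulSoup exactly as get_city_aqi does...
    return r.text[/code]

In main() you would then check for None before printing, so one unreachable city no longer aborts the whole loop.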
[code=Python]# I just finished this chapter too. The site has been updated, but the
# index-and-lookup approach is unchanged. Think about other ways to scrape
# what you need: this code scrapes the city rankings instead -- different
# URL, same method, same principle. Hope it gives you some ideas.


import requests
from bs4 import BeautifulSoup
import csv

def process_city_rank(url):
    """
    Fetch the ranking page and extract (rank, city, AQI) rows.
    """
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')

    aqi_city_rank = []
    # Use find_all to locate the page elements.
    #
    # Note: once you grab a container tag, you get everything under it,
    # including all of its child tags. To pull out each individual value
    # or string, loop over the matched tags and index into them row by
    # row -- only then do you get "each" target value. (A tighter
    # zip-based variant is sketched after this code block.)
    total_text = soup.find_all('ul', {'class': 'pj_area_data_details rank_box'})[0]
    x = total_text.find_all('span', {'class': 'pjadt_ranknum'})
    z = total_text.find_all('a')
    l = total_text.find_all('span', {'class': 'pjadt_aqi'})
    for i in range(len(x)):  # one entry per ranked city (352 at the time of posting)
        line1 = x[i]
        line2 = z[i]
        line3 = l[i]
        y = line1.string
        e = line2.text
        f = line3.text
        aqi_city_rank.append((y, e, f))
    return aqi_city_rank


def main():
    url = 'http://www.pm25.com/rank.html'

    aqi_city = process_city_rank(url)
    header = ['rank', 'city', 'aqi']

    # newline='' keeps csv.writer from emitting blank rows on Windows
    with open('city_rank.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for i, line in enumerate(aqi_city):
            # assumed: progress message every 10 rows (the post's
            # condition "(i + 1) == 0" can never be true)
            if (i + 1) % 10 == 0:
                print('Processed {} of {} records.'.format(i + 1, len(aqi_city)))
            rank = line[0]
            name = line[1]
            aqi = line[2]
            print('rank: {}'.format(rank), 'city: {}'.format(name), 'aqi={}'.format(aqi))
            writer.writerow([rank, name, aqi])


if __name__ == '__main__':
    main()[/code]
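
One further simplification worth considering: the three parallel find_all lists in process_city_rank can be paired with zip, which removes both the row count and the manual indexing (same selectors and the same ul tag as above; the function name extract_rows is mine):

[code=Python]def extract_rows(total_text):
    """Pair rank / city / AQI tags positionally; zip stops at the shortest list."""
    ranks = total_text.find_all('span', {'class': 'pjadt_ranknum'})
    cities = total_text.find_all('a')
    aqis = total_text.find_all('span', {'class': 'pjadt_aqi'})
    return [(r.get_text(strip=True), c.get_text(strip=True), a.get_text(strip=True))
            for r, c, a in zip(ranks, cities, aqis)][/code]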
