@Robin_TY Regarding Chapter 9, Case 8 (Air Quality Index calculator, v7.0): fetching the AQI for all cities only retrieves the first dozen or so, and then raises many errors (the teacher's source code does not run correctly either).

import requests
from bs4 import BeautifulSoup


def get_city_aqi(city_pinyin):
    """
        获取城市的AQI
    """
    url = 'http://pm25.in/' + city_pinyin
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class': 'span1'})

    city_aqi = []
    for i in range(8):  # the first 8 'span1' divs hold the AQI and pollutant readings
        div_content = div_list[i]
        caption = div_content.find('div', {'class': 'caption'}).text.strip()
        value = div_content.find('div', {'class': 'value'}).text.strip()

        city_aqi.append((caption, value))
    return city_aqi


def get_all_cities():
    """
        获取所有城市
    """
    url = 'http://pm25.in/'
    city_list = []
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')

    city_div = soup.find_all('div', {'class': 'bottom'})[1]
    city_link_list = city_div.find_all('a')
    for city_link in city_link_list:
        city_name = city_link.text
        city_pinyin = city_link['href'][1:]
        city_list.append((city_name, city_pinyin))
    return city_list


def main():
    """
        主函数
    """
    city_list = get_all_cities()
    for city in city_list:
        city_name = city[0]
        city_pinyin = city[1]
        city_aqi = get_city_aqi(city_pinyin)
        print(city_name, city_aqi)


if __name__ == '__main__':
    main()
Running the script produces the following traceback:
Traceback (most recent call last):
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 360, in _error_catcher
    yield
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 669, in read_chunked
    chunk = self._handle_chunk(amt)
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 624, in _handle_chunk
    returned_chunk = self._fp._safe_read(self.chunk_left)
  File "D:\Anaconda3\lib\http\client.py", line 620, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "D:\Anaconda3\lib\socket.py", line 589, in readinto
    return self._sock.recv_into(b)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Anaconda3\lib\site-packages\requests\models.py", line 750, in generate
    for chunk in self.raw.stream(chunk_size, decode_content=True):
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 490, in stream
    for line in self.read_chunked(amt, decode_content=decode_content):
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 694, in read_chunked
    self._original_response.close()
  File "D:\Anaconda3\lib\contextlib.py", line 130, in __exit__
    self.gen.throw(type, value, traceback)
  File "D:\Anaconda3\lib\site-packages\urllib3\response.py", line 365, in _error_catcher
    raise ReadTimeoutError(self._pool, None, 'Read timed out.')
urllib3.exceptions.ReadTimeoutError: HTTPConnectionPool(host='pm25.in', port=80): Read timed out.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/DevProjects/python/python_course/lect08/aqi_v7.0.py", line 61, in <module>
    main()
  File "D:/DevProjects/python/python_course/lect08/aqi_v7.0.py", line 56, in main
    city_aqi = get_city_aqi(city_pinyin)
  File "D:/DevProjects/python/python_course/lect08/aqi_v7.0.py", line 16, in get_city_aqi
    r = requests.get(url, timeout=30)
  File "D:\Anaconda3\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "D:\Anaconda3\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "D:\Anaconda3\lib\site-packages\requests\sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\Anaconda3\lib\site-packages\requests\sessions.py", line 686, in send
    r.content
  File "D:\Anaconda3\lib\site-packages\requests\models.py", line 828, in content
    self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
  File "D:\Anaconda3\lib\site-packages\requests\models.py", line 757, in generate
    raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='pm25.in', port=80): Read timed out.

Process finished with exit code 1
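
The final frames show a socket read timing out mid-response, which requests re-raises as a ConnectionError: once pm25.in stalls on one page, the whole run dies, which matches only the first dozen or so cities succeeding. A minimal sketch of a retry wrapper, assuming the site is merely slow or throttling; the helper name, retry count, and delay below are illustrative, not part of the course code:

import time

import requests


def get_with_retry(url, retries=3, delay=2):
    """Fetch a URL, retrying on timeouts and connection errors.

    retries/delay are illustrative values, not from the course code.
    """
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay)  # back off before retrying

# usage: replace `r = requests.get(url, timeout=30)` in get_city_aqi and
# get_all_cities with `r = get_with_retry(url)`, and consider a short
# time.sleep() between cities in main() so the site is not hammered.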

微联商务


I also just finished this chapter. The website has been updated, but the index/lookup approach is unchanged; think about fetching what you want another way. The code below also scrapes the city AQI rankings: the method is the same, only the URL is different, so the principle carries over. Hope this helps.


import requests
from bs4 import BeautifulSoup
import csv

def process_city_rank(url):
    """
    Fetch the ranking page and extract (rank, city, AQI) rows.
    """
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')

    aqi_city_rank = []
    # Use find_all to locate the page elements.
    # Note: grabbing the container element returns everything nested
    # under it, including all child elements. To scrape the children
    # individually you still need a loop that indexes into each row's
    # result list; only then do you get each target value or string.
    total_text = soup.find_all('ul', {'class': 'pj_area_data_details rank_box'})[0]
    ranks = total_text.find_all('span', {'class': 'pjadt_ranknum'})
    links = total_text.find_all('a')
    aqis = total_text.find_all('span', {'class': 'pjadt_aqi'})
    # iterate over however many rows the page has rather than
    # hard-coding the count
    for rank, link, aqi in zip(ranks, links, aqis):
        aqi_city_rank.append((rank.text.strip(), link.text.strip(), aqi.text.strip()))
    return aqi_city_rank


def main():
    url = 'http://www.pm25.com/rank.html'

    aqi_city = process_city_rank(url)
    header = ['rank', 'city', 'aqi']

    # newline='' keeps csv.writer from emitting blank rows on Windows
    with open('city_rank.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for i, line in enumerate(aqi_city):
            if (i + 1) % 10 == 0:  # progress message every 10 records
                print('Processed {} of {} records.'.format(i + 1, len(aqi_city)))
            rank, name, aqi = line
            print('rank: {}'.format(rank), 'city: {}'.format(name), 'aqi={}'.format(aqi))
            writer.writerow([rank, name, aqi])


if __name__ == '__main__':
    main()
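
To spot-check the result, the CSV can be read back with the same csv module; a minimal sketch, assuming the city_rank.csv written above:

import csv

# print the header plus the first five data rows as a spot check
with open('city_rank.csv', encoding='utf-8', newline='') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 5:
            break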
