import csv
import os
import re

import requests
from bs4 import BeautifulSoup
# Matches the text that follows the <br/> inside an entry's ".bd p" paragraph,
# up to the next tag. download() splits that text on '/' into year/area/genre.
_INFO_RE = re.compile(r"(?<=<br/>).*?(?=<)", re.S)


def _parse_entry(li):
    """Extract one movie record from an ``ol li`` element.

    Returns:
        A ``(index, title, rating, year, area, genre)`` tuple of strings, or
        ``None`` when the element lacks the markup the selectors expect.
    """
    index_tag = li.select_one("em")
    title_tag = li.select_one(".hd .title")
    rating_tag = li.select_one(".bd .star .rating_num")
    info_tag = li.select_one(".bd p")
    tags = (index_tag, title_tag, rating_tag, info_tag)
    if any(tag is None for tag in tags):
        return None

    match = _INFO_RE.search(str(info_tag))
    if match is None:
        return None

    fields = [part.strip() for part in match.group().strip().split('/')]
    # Pad so a short info line yields empty columns instead of an IndexError.
    fields += [''] * (3 - len(fields))
    year, area, genre = fields[:3]
    return index_tag.text, title_tag.text, rating_tag.text, year, area, genre


def download(url, page):
    """Scrape the Douban Top 250 listing and append every movie to the CSV.

    Fetches ``url``, writes one row per ``ol li`` entry through
    ``write_fo_file``, then advances ``page`` by 25 and fetches the next
    listing page, stopping once ``page`` reaches 250.

    Args:
        url: Address of the first listing page to fetch.
        page: The ``start`` offset that ``url`` corresponds to.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
    """
    # NOTE(review): a browser-like User-Agent is sent because some sites reject
    # the default python-requests one; confirm whether Douban requires it.
    headers = {"User-Agent": "Mozilla/5.0"}

    while True:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly rather than parsing an error page into an empty CSV.
        response.raise_for_status()

        # .text is the decoded body; the bare response is just <Response [200]>.
        soup = BeautifulSoup(response.text, 'lxml')
        for li in soup.select("ol li"):
            record = _parse_entry(li)
            if record is None:
                # Not a movie entry (or the markup changed): skip it.
                continue
            write_fo_file(*record)

        page += 25
        if page >= 250:
            break
        url = "https://movie.douban.com/top250?start=%d&filter=" % page
def write_fo_file(index, title, rating, year, area, type):
    """Append one movie record as a row of ``douban_top250.csv``.

    The row is written with the ``csv`` module, so a field that contains a
    comma or a quote is quoted instead of corrupting the column layout. The
    file is always encoded as UTF-8 and each row ends with ``\\n``.

    Args:
        index: Rank of the movie in the list.
        title: Movie title.
        rating: Rating string.
        year: Release year.
        area: Production country/region.
        type: Genre text (the name shadows the builtin but is kept for
            backward compatibility with keyword callers).
    """
    # newline='' leaves line endings under the csv writer's control.
    with open('douban_top250.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow((index, title, rating, year, area, type))
def main():
    """Rebuild ``douban_top250.csv`` from scratch and report completion.

    Any CSV left over from an earlier run is deleted first, because rows are
    appended to the file; the crawl then starts at the first listing page
    with offset 0.
    """
    output_path = 'douban_top250.csv'
    stale_output = os.path.exists(output_path)
    if stale_output:
        os.remove(output_path)

    download('https://movie.douban.com/top250', 0)
    print("爬取完毕。")
# Run the scraper only when executed as a script, not when imported.
if "__main__" == __name__:
    main()