Python爬取豆瓣电影排名

去年写过一篇从豆瓣网上爬取最近上映电影的文章，网址：

Python爬取豆瓣电影

如今主要在研究机器学习的一些算法，Python代码基本上停产了。为了不对Python生疏，便从CSDN上找了一些简单的Python代码练练手。下面是用Python爬取豆瓣电影排名Top250的代码，结果保存为可以用Excel打开的csv文件。

使用到的模块：requests 和 bs4（代码中 BeautifulSoup 使用 lxml 解析器，因此还需要安装 lxml）。

代码：

import requests
import re
import os
from bs4 import BeautifulSoup


def download(url, page):
    """Scrape Douban Top 250 pages starting at *url* and save every movie.

    Every ``ol li`` item of a page is parsed for its rank, title, rating and
    the "year / area / genre" line, then handed to ``write_fo_file``.  After a
    page is processed the offset advances by 25 and the next page is fetched,
    until the offset reaches 250.

    Args:
        url: Address of the first page to fetch.
        page: Offset of the first movie on that page (0 for the first page).

    Raises:
        requests.RequestException: If a request times out, fails, or the
            server answers with an error status.
    """
    # NOTE(review): many sites refuse the default python-requests
    # User-Agent, so a browser-like one is sent -- confirm it is still
    # accepted by the target site.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
        )
    }

    while True:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        # .text is the decoded body; the bare call returns a Response object.
        soup = BeautifulSoup(response.text, 'lxml')

        for li in soup.select("ol li"):
            index_tag = li.select_one("em")
            title_tag = li.select_one(".hd .title")
            rating_tag = li.select_one(".bd .star .rating_num")
            if index_tag is None or title_tag is None or rating_tag is None:
                # Not a movie entry (or the markup changed); skip it rather
                # than crash on a missing node.
                continue

            # The info line is the text between the <br/> inside ".bd p" and
            # the next tag, formatted as "year / area / genre".
            info_match = re.search(
                r"(?<=<br/>).*?(?=<)", str(li.select_one(".bd p")), re.S
            )
            fields = info_match.group().strip().split('/') if info_match else []
            fields = [field.strip() for field in fields]
            # Pad so that a short or missing info line yields empty columns
            # instead of an IndexError.
            fields += [''] * (3 - len(fields))
            year, area, genre = fields[:3]

            write_fo_file(
                index_tag.text, title_tag.text, rating_tag.text,
                year, area, genre,
            )

        page += 25
        if page >= 250:
            break
        url = "https://movie.douban.com/top250?start=%d&filter=" % page


def write_fo_file(index, title, rating, year, area, type):
    """Append one movie record as a row of ``douban_top250.csv``.

    The row is written with the ``csv`` module so that fields containing
    commas, quotes or line breaks are quoted correctly.  The file is encoded
    as UTF-8 with a BOM (``utf-8-sig``) so the Chinese text does not depend on
    the platform's default encoding.

    Args:
        index: Rank of the movie.
        title: Movie title.
        rating: Rating value.
        year: Release year.
        area: Production country/region.
        type: Genre(s) of the movie.  The name shadows the builtin but is
            kept so existing keyword callers keep working.
    """
    import csv  # local import keeps this function self-contained

    # newline='' lets the csv writer control line endings itself; the
    # context manager closes the file even if the write raises.
    with open('douban_top250.csv', 'a', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerow([index, title, rating, year, area, type])


def main():
    """Remove any previous output file, scrape the Top 250 list, report done."""
    output_path = 'douban_top250.csv'

    # Rows are appended one by one, so a leftover file from an earlier run
    # has to be deleted first to avoid duplicated records.
    if os.path.exists(output_path):
        os.remove(output_path)

    download('https://movie.douban.com/top250', 0)
    print("爬取完毕。")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

import requests

import re

import os

from bs4 import BeautifulSoup

def download(url, page):
    """Scrape Douban Top 250 pages starting at *url* and save every movie.

    Every ``ol li`` item of a page is parsed for its rank, title, rating and
    the "year / area / genre" line, then handed to ``write_fo_file``.  After a
    page is processed the offset advances by 25 and the next page is fetched,
    until the offset reaches 250.

    Args:
        url: Address of the first page to fetch.
        page: Offset of the first movie on that page (0 for the first page).

    Raises:
        requests.RequestException: If a request times out, fails, or the
            server answers with an error status.
    """
    # NOTE(review): many sites refuse the default python-requests
    # User-Agent, so a browser-like one is sent -- confirm it is still
    # accepted by the target site.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
        )
    }

    while True:
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        # .text is the decoded body; the bare call returns a Response object.
        soup = BeautifulSoup(response.text, 'lxml')

        for li in soup.select("ol li"):
            index_tag = li.select_one("em")
            title_tag = li.select_one(".hd .title")
            rating_tag = li.select_one(".bd .star .rating_num")
            if index_tag is None or title_tag is None or rating_tag is None:
                # Not a movie entry (or the markup changed); skip it rather
                # than crash on a missing node.
                continue

            # The info line is the text between the <br/> inside ".bd p" and
            # the next tag, formatted as "year / area / genre".
            info_match = re.search(
                r"(?<=<br/>).*?(?=<)", str(li.select_one(".bd p")), re.S
            )
            fields = info_match.group().strip().split('/') if info_match else []
            fields = [field.strip() for field in fields]
            # Pad so that a short or missing info line yields empty columns
            # instead of an IndexError.
            fields += [''] * (3 - len(fields))
            year, area, genre = fields[:3]

            write_fo_file(
                index_tag.text, title_tag.text, rating_tag.text,
                year, area, genre,
            )

        page += 25
        if page >= 250:
            break
        url = "https://movie.douban.com/top250?start=%d&filter=" % page

def write_fo_file(index, title, rating, year, area, type):
    """Append one movie record as a row of ``douban_top250.csv``.

    The row is written with the ``csv`` module so that fields containing
    commas, quotes or line breaks are quoted correctly.  The file is encoded
    as UTF-8 with a BOM (``utf-8-sig``) so the Chinese text does not depend on
    the platform's default encoding.

    Args:
        index: Rank of the movie.
        title: Movie title.
        rating: Rating value.
        year: Release year.
        area: Production country/region.
        type: Genre(s) of the movie.  The name shadows the builtin but is
            kept so existing keyword callers keep working.
    """
    import csv  # local import keeps this function self-contained

    # newline='' lets the csv writer control line endings itself; the
    # context manager closes the file even if the write raises.
    with open('douban_top250.csv', 'a', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerow([index, title, rating, year, area, type])

def main():
    """Remove any previous output file, scrape the Top 250 list, report done."""
    # Rows are appended one by one, so a leftover file from an earlier run
    # has to be deleted first to avoid duplicated records.
    if os.path.exists('douban_top250.csv'):
        os.remove('douban_top250.csv')

    url = 'https://movie.douban.com/top250'
    download(url, 0)
    print("爬取完毕。")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

参考：https://download.csdn.net/download/guliang21/10586157

效果：

Python爬取豆瓣电影排名

大模型AlpacaFarm分析

NLG文本评估任务或许并不需要真值或参考文本

大模型中的RepE表征工程

大模型也是一种优化器（LLM as Optimizer）

全栈开发与快速部署Demo

学术idea自动发现与生成

自回归语言模型（language model）Python实现

粉丝期待的三体电影宇宙（近四十部电影与电视剧集）

基于历史对比学习的时序知识图谱推理

泰拉瑞亚Terraria快速部署Linux服务器

留下评论取消回复

相关文章

留下评论取消回复