本文共 3049 字,大约阅读时间需要 10 分钟。
这里就直接上代码,实现了将数据用 print 打印到控制台、将数据存入 Excel 表格。
# NOTE: the MySQL storage part (mySql) is only meant as a sketch of the idea.
import requests
import xlwt
from bs4 import BeautifulSoup
import DBUtils

# Request settings shared by every page fetch.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.4071 SLBChan/21"}
_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# Characters that must be backslash-escaped inside a quoted SQL string literal.
# NOTE(review): assumes the server accepts backslash escapes (no
# NO_BACKSLASH_ESCAPES sql_mode) -- confirm against the target database.
_SQL_ESCAPES = str.maketrans({
    "\0": "\\0",
    "\\": "\\\\",
    "\n": "\\n",
    "\r": "\\r",
    "\x1a": "\\Z",
    '"': '\\"',
    "'": "\\'",
})

# Column keys shared by the scraped records, the Excel header and the SQL insert.
_FIELDS = ('类型', '电影名称', '排名', '分数')


def _fetch_soup(url):
    """Download *url*, decode the body as GBK and return the parsed document."""
    page = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT).content.decode("GBK")
    return BeautifulSoup(page, 'html.parser')


def _sql_escape(value):
    """Return *value* as text that is safe inside a single-quoted SQL literal."""
    return str(value).translate(_SQL_ESCAPES)


def getCid():
    """Collect the board links from the index page.

    Returns:
        A tuple ``(code, type_L)``: ``code`` is the list of link paths taken
        from the anchors inside ``div.links`` (with surrounding dots
        stripped), ``type_L`` is the fixed list of category labels.
    """
    soup = _fetch_soup("http://top.baidu.com/boards?fr=topindex")
    anchors = soup.find('div', attrs={'class': "links"}).find_all('a')
    type_L = ["爱情", "喜剧", "惊悚", "科幻", "剧情"]
    # The last anchor is left out, exactly as the original index loop did
    # (presumably it is not one of the categories -- TODO confirm).
    code = [anchor['href'].strip('.') for anchor in anchors[:-1]]
    return code, type_L


def getItem(page, type_L):
    """Scrape one board page into a list of record dicts.

    Args:
        page: Path appended to ``http://top.baidu.com``.
        type_L: Category label stored under the ``类型`` key of every record.

    Returns:
        A list of dicts with the keys ``类型``, ``电影名称``, ``排名`` and ``分数``.
    """
    soup = _fetch_soup("http://top.baidu.com%s" % page)
    table = soup.find('table', attrs={'class': "list-table"})
    names = table.find_all('td', attrs={'class': "keyword"})
    ranks = table.find_all('td', attrs={'class': "first"})
    scores = table.find_all('td', attrs={'class': "last"})
    # zip() stops at the shortest column, so a row with a missing cell can no
    # longer raise IndexError half-way through the page.
    return [
        {
            '类型': type_L,
            '电影名称': name.a.text,
            '排名': rank.span.text,
            '分数': score.span.text,
        }
        for name, rank, score in zip(names, ranks, scores)
    ]


def mySql(codex):
    """Build one INSERT statement per record and hand them to DBUtils.

    Args:
        codex: List of record dicts as produced by ``getItem``.
    """
    sqlk = "INSERT INTO tb_movie(类型,名称,排名,分数) VALUES ('%s','%s','%s','%s');"
    # The values come from scraped pages, so they are escaped before being
    # interpolated; a quote in a title would otherwise break the statement.
    sql_list = [
        sqlk % tuple(_sql_escape(record[field]) for field in _FIELDS)
        for record in codex
    ]
    DBUtils.insertDataAMX(sql_list)


def saveExcel(result):
    """Write the records to a workbook with one header row.

    Args:
        result: List of record dicts as produced by ``getItem``.
    """
    wb = xlwt.Workbook()
    sheet = wb.add_sheet('风云')
    list_h = list(_FIELDS)
    for col, title in enumerate(list_h):
        sheet.write(0, col, title)
    # Data rows start at row 1, right below the header.
    for row, record in enumerate(result, start=1):
        for col, key in enumerate(list_h):
            sheet.write(row, col, record[key])
    # xlwt only produces the legacy .xls format, so the file must not be
    # named .xlsx (Excel refuses to open such a file).
    wb.save('风云爬.xls')


def main():
    """Crawl every category board and return the collected records."""
    codes, types = getCid()
    codex = []
    # Pair each board link with its label; links without a label are skipped.
    for code, type_name in zip(codes, types):
        codex.extend(getItem(code, type_name))
    # Enable whichever sink is needed:
    # saveExcel(codex)
    # mySql(codex)
    return codex


if __name__ == "__main__":
    main()
其中调用的 SQL 方法如下:
def getConnect():
    """Open and return a new pymysql connection using the hard-coded settings."""
    # Local import: the snippet shows no module-level import of pymysql.
    import pymysql

    conn = pymysql.connect(host="", user="root", password="123", database="pymysql", charset="utf8")
    return conn


def closeConnect(cursor, conn):
    """Close the cursor and the connection, skipping whichever is falsy.

    The connection is closed even when closing the cursor raises.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        if conn:
            conn.close()


def insertDataAMX(sql_list):
    """Execute every statement in *sql_list* in one transaction.

    Args:
        sql_list: Iterable of complete SQL statements.

    Returns:
        True when the last executed statement reports ``rowcount > 0``,
        otherwise False (this includes an empty *sql_list*).

    Raises:
        Whatever the driver raises; the transaction is rolled back and the
        connection closed before the exception propagates.
    """
    conn = getConnect()  # acquire the connection
    cursor = None
    try:
        cursor = conn.cursor()  # the cursor is the session that runs the SQL
        for sql in sql_list:
            cursor.execute(sql)
        conn.commit()  # nothing is persisted until the commit
        # Read the row count while the cursor is still open.
        count = cursor.rowcount
    except Exception:
        # Do not leave a half-applied batch behind.
        conn.rollback()
        raise
    finally:
        closeConnect(cursor, conn)
    return count > 0
转载地址:http://wbuhf.baihongyu.com/