
rov8.py 7.3KB

    #!/usr/bin/env python
    # coding: utf-8
    """
    2. Fetch every detail url from
       http://www.rov8.com/index.php/Times/index/p/number.html (note: the list is paginated).
    3. Scrape the information we want from each subtitle-report page.
    4. Store the extracted fields in MySQL.

    Fields needed:
    1. film title           film_title
    2. subtitle-ready time  caption_time
    3. film duration        film_duration
    4. film type            film_type
    5. language version     language_version
    6. encoding format      macro_format
    7. DCP package size     dcp_size
    """
    import base64
    import json
    import time
    from time import strftime, localtime
    import urllib2

    import requests
    from bs4 import BeautifulSoup
    from lxml import etree

    import mysql

    # Pretend to be a normal browser.
    req_header = {'User-Agent': 'Mozilla/5.0 (Macintosh;'
                                ' Intel Mac OS X 10_11_6) '
                                'AppleWebKit/537.36 (KHTML, like Gecko)'
                                ' Chrome/59.0.3071.115 Safari/537.36'}


    class ROV:
        def __init__(self):
            self.base_url = 'http://www.rov8.com/index.php/Times/index/p/'
            self.req_header = req_header
            self.mysql = mysql.Mysql()

        def getCurrentTime(self):
            """Return the current time, e.g. [2018-05-01 12:00:00]."""
            return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

        def getCurrentDate(self):
            """Return the current date, e.g. 2018-05-01."""
            return time.strftime('%Y-%m-%d', time.localtime(time.time()))

        def getHtml(self, targetUrl):
            """GET the target url with browser headers and return the page as unicode."""
            req = urllib2.Request(targetUrl, headers=self.req_header)
            response = urllib2.urlopen(req)
            return response.read().decode('utf-8')

        def getContents(self, pageNum):
            """Extract the url of every film listed on the given list page."""
            url = self.base_url + str(pageNum) + '.html'
            htmldata = self.getHtml(url)
            tree = etree.HTML(htmldata)
            urls = tree.xpath('//div[@class="content-zw"]/ul/li/a/@href')
            for url in urls:
                self.getDetailPage('http://www.rov8.com/' + url)

        def getDetailPage(self, detailPageUrl):
            """Fetch a film's detail page and pull out the title and the info block."""
            htmldata = self.getHtml(detailPageUrl)
            soup = BeautifulSoup(htmldata, 'lxml')
            film_title = soup.find('td', attrs={"class": "Atitle"}).get_text()
            div = soup.find('div', attrs={"class": "art-cnt"}).get_text().encode("utf-8")
            self.getFields(div, film_title)

        def getCaptionTimeByFilmTitle(self, film_name):
            """Query the subtitle-ready time by film title; no login required."""
            filmName = self.subString(film_name)
            todaytime = strftime("%Y-%m-%d", localtime())
            salt = "FYZS_sub|" + filmName + "|" + todaytime
            # The API expects the salt base64-encoded twice; strip the newlines
            # base64.encodestring inserts, or they corrupt the query string.
            searchcode = base64.encodestring(base64.encodestring(salt)).replace('\n', '')
            url = "http://www.rov8.com/fy/API.php?a=rov2018&b=GFYZS&c=skm&d=" + searchcode
            r = requests.get(url)
            # The response is base64-encoded twice as well.
            data = base64.decodestring(base64.decodestring(r.text))
            info = json.loads(data)[0]
            captionTime = info[2][:8]
            if captionTime[0] != "0":
                # Not out yet.
                return u'别着急,还木有出来呢~~'
            return captionTime

        def getFields(self, div, film_title):
            """Filter the fields we need out of the scraped info block."""
            captionTime = self.getCaptionTimeByFilmTitle(film_title)
            caption_time = u'出字幕时间:' + captionTime
            film_name = u'影片名称:' + film_title
            long_biao = u'龙标:' + u'暂无'
            dcp_name = 'DCP包名称:""'

            # Keep the first space-delimited token of every non-empty line.
            results = []
            for line in div.splitlines():
                line = line.strip()
                if not line:
                    continue
                nPos = line.find(' ')
                if nPos == -1:
                    nPos = len(line)
                results.append(line[:nPos])

            # Pick the fields we need out of the scraped tokens.
            rlists = [film_name, caption_time]
            for token in results:
                if token.find('影片时长') != -1 and len(token) < 24:  # film duration
                    rlists.append(token)
                elif token.find('影片类型') != -1:   # film type
                    rlists.append(token)
                elif token.find('语言版本') != -1:   # language version
                    if len(token) < 16:
                        rlists.append('语言版本:无')
                    else:
                        rlists.append(token)
                elif token.find('编码格式') != -1:   # encoding format
                    rlists.append(token)
                elif token.find('影片画幅') != -1:   # aspect ratio
                    rlists.append(token)
                elif token.find('色彩空间') != -1:   # color space
                    rlists.append(token)
                elif token.find('影片声道') != -1:   # audio channels
                    rlists.append(token)
                elif token.find('DCP包大小') != -1:  # DCP package size
                    rlists.append(token)
                    break
            rlists.append(dcp_name)
            rlists.append(long_biao)

            print '*' * 80
            # for i in rlists:
            #     print i
            self.list2dict(rlists)

        def list2dict(self, lists):
            """Turn the ordered field list into a dict so the MySQL insert is easy."""
            # The page lists the fields in a fixed order, so position i maps to a
            # fixed column. The offset is the length of the label prefix to strip:
            # in characters for the first two (unicode) entries, in utf-8 bytes
            # for the scraped tokens.
            columns = [('film_title', 5), ('caption_time', 6),
                       ('film_duration', 15), ('film_scope', 15),
                       ('film_type', 15), ('language_version', 15),
                       ('color_space', 15), ('film_track', 15),
                       ('macro_format', 15), ('dcp_size', 15),
                       ('dcp_name', 15), ('long_biao', None)]
            result_dict = {}
            for i in range(min(len(lists), len(columns))):
                name, offset = columns[i]
                if name == 'long_biao':
                    result_dict[name] = '暂无'
                else:
                    result_dict[name] = lists[i][offset:]
            self.mysql.dealData(result_dict)
            for k, v in result_dict.items():
                print k, v

        def subString(self, filmTitle):
            """Drop the bracketed part ([...]) from the film title."""
            if filmTitle.find('[') != -1:
                return filmTitle[:filmTitle.index('[')]
            return filmTitle

        def main(self):
            self.getContents(1)
            # To scrape all list pages:
            # for i in range(26):
            #     self.getContents(i + 1)


    if __name__ == '__main__':
        rov = ROV()
        rov.main()
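
rov8.py imports a local `mysql` module and hands each record to `mysql.Mysql().dealData(result_dict)`, but that helper is not included here. Below is a minimal sketch of what it might look like, assuming Python 2 with the MySQLdb driver; the connection parameters and the `film` table (with columns matching the keys produced by `list2dict`) are hypothetical.

    # mysql.py -- hypothetical helper assumed by rov8.py
    # coding: utf-8
    import MySQLdb


    class Mysql:
        def __init__(self):
            # Connection parameters are placeholders; adjust for your setup.
            self.conn = MySQLdb.connect(host='localhost', user='root',
                                        passwd='secret', db='rov8',
                                        charset='utf8')

        def dealData(self, data):
            """Insert one scraped record (column name -> value) into the film table."""
            columns = ', '.join(data.keys())
            placeholders = ', '.join(['%s'] * len(data))
            sql = 'INSERT INTO film (%s) VALUES (%s)' % (columns, placeholders)
            cursor = self.conn.cursor()
            try:
                cursor.execute(sql, data.values())
                self.conn.commit()
            except MySQLdb.Error:
                self.conn.rollback()
                raise
            finally:
                cursor.close()

Building the statement with `%s` placeholders and passing the values separately lets the driver escape them, which matters here because film titles can contain quotes and brackets.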