|
#!/usr/bin/env python
# coding: utf-8
import base64
import json
from time import strftime, localtime
import urllib2
import requests
from bs4 import BeautifulSoup
from lxml import etree
import time
import mysql
"""
2、 然后获取http://www.rov8.com/index.php/Times/index/p/number.html所有的详情url。(注意是分页)
3、 获取字幕报页面我们想要的信息,然后抓取出来。
4、 将获取的字段存储到mysql中。
所需字段
1、片名 film_title
2、出字幕时间 caption_time
3、影片时长 film_duration
4、影片类型 film_type
5、语言版本 language_version
6、编码格式 macro_format
7、DCP包大小 scp_size
"""
# Browser-impersonating request headers (desktop Chrome on macOS) so the
# target site serves its normal HTML instead of blocking the crawler.
req_header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/59.0.3071.115 Safari/537.36',
}
class ROV:
    """Scraper for www.rov8.com film/DCP listings (Python 2 only).

    Walks the paginated index at /index.php/Times/index/p/<n>.html,
    follows each detail link, extracts the fields listed in the module
    header, and hands a column-name -> value dict to mysql.Mysql().
    Relies on Python 2 str/unicode semantics (urllib2, byte-offset
    slicing of UTF-8 encoded text, print statements).
    """
    def __init__(self):
        # Index pages are base_url + page number + '.html'.
        self.base_url = 'http://www.rov8.com/index.php/Times/index/p/'
        self.req_header = req_header
        # Project-local DB helper; dealData() receives the field dict.
        self.mysql = mysql.Mysql()
    # Return the current timestamp, e.g. '[2018-01-01 12:00:00]'.
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))
    # Return the current date, e.g. '2018-01-01'.
    def getCurrentDate(self):
        return time.strftime('%Y-%m-%d', time.localtime(time.time()))
    def getHtml(self, targetUrl):
        """Fetch targetUrl with the browser-like headers and return the
        body decoded from UTF-8 (unicode)."""
        # NOTE(review): passing "" (not None) as the data argument makes
        # urllib2 send a POST request here, not a GET; data=None would be
        # a plain GET -- confirm the site really accepts POST.
        req = urllib2.Request(targetUrl, "", headers=req_header)
        response = urllib2.urlopen(req)
        return response.read().decode('utf-8')
    def getContents(self, pageNum):
        """Fetch index page `pageNum` and crawl every detail URL on it
        (the index is paginated)."""
        url = self.base_url + str(pageNum) + '.html'
        # NOTE(review): unlike getHtml(), this request sends no custom
        # headers -- consider reusing self.getHtml(url) for consistency.
        html = urllib2.urlopen(url)
        # Decode the page body to unicode.
        htmldata = html.read().decode('utf-8')
        detailPath = etree.HTML(htmldata)
        # Relative links to the per-film detail pages.
        urls = detailPath.xpath('//div[@class="content-zw"]/ul/li/a/@href')
        for url in urls:
            self.getDetailPage('http://www.rov8.com/' + url)
    def getDetailPage(self, getDetailPageUrl):
        """
        Fetch one film's detail page and extract the information we want.
        """
        htmldata = self.getHtml(getDetailPageUrl)
        soup = BeautifulSoup(htmldata, 'lxml')
        # Film title lives in the <td class="Atitle"> cell.
        film_title = soup.find('td', attrs={"class": "Atitle"}).get_text()
        # Body text is re-encoded to UTF-8 bytes on purpose: getFiled()
        # and list2dict() strip label prefixes by BYTE offsets.
        div = soup.find('div', attrs={"class": "art-cnt"}).get_text().encode("utf-8")
        self.getFiled(div, film_title)
    def getCaptionTimeByFilmTitle(self, film_name):
        """
        Query the site's API (no login required) for the subtitle release
        time of the given film title.
        """
        # Drop any trailing '[...]' qualifier from the title first.
        filmName = self.subString(film_name)
        todaytime = strftime("%Y-%m-%d", localtime())
        # The API expects the payload double base64-encoded.
        salt = "FYZS_sub|" + filmName + "|" + todaytime
        searchcode = base64.encodestring(base64.encodestring(salt))
        url = "http://www.rov8.com/fy/API.php?a=rov2018&b=GFYZS&c=skm&d=" + searchcode
        r = requests.get(url)
        # Response body is double base64-encoded JSON as well.
        date = base64.decodestring(base64.decodestring(r.text))
        info = json.loads(date)[0]
        captionTime = info[2][:8]
        result = ""
        # NOTE(review): a leading character other than '0' apparently
        # means "subtitle time not published yet" -- verify with the API.
        if captionTime[0] != "0":
            result = u'别着急,还木有出来呢~~'
        else:
            result = captionTime
        return result
    def getFiled(self, div, film_title):
        """
        Filter the wanted fields out of the detail-page body text and
        forward them, as an ordered list, to list2dict().
        """
        captionTime = self.getCaptionTimeByFilmTitle(film_title)
        # Label prefixes: UTF-8 source literals decoded back to unicode.
        caption_time = '出字幕时间:'.decode('utf-8') + captionTime
        film_name = '影片名称:'.decode('utf-8') + film_title
        long_biao = '龙标:'.decode('utf-8') + '暂无'
        dcp_name = 'DCP包名称:""'
        results = []
        datalist = div.splitlines()
        # Keep the first whitespace-delimited token of each non-empty line.
        for j in range(0, len(datalist)):
            str = datalist[j].strip()  # NOTE(review): shadows builtin 'str'
            if len(str) != 0:
                nPos = str.find(' ')
                if nPos == -1:
                    nPos = len(str)
                results.append(str[0:nPos])
        # Pick the wanted fields out of the extracted tokens.
        # NOTE(review): fields are appended in the order they appear on
        # the page, while list2dict() maps them purely by position --
        # confirm the page order always matches list2dict's mapping.
        rlists = []
        rlists.append(film_name)
        rlists.append(caption_time)
        for str in results:
            if str.find('影片时长') != -1 and len(str) < 24:
                rlists.append(str)
            elif str.find('影片类型') != -1:
                rlists.append(str)
            elif str.find('语言版本') != -1:
                # Very short value means the field is effectively empty.
                if len(str) < 16:
                    rlists.append('语言版本:无')
                else:
                    rlists.append(str)
            elif str.find('编码格式') != -1:
                rlists.append(str)
            elif str.find('影片画幅') != -1:
                rlists.append(str)
            elif str.find('色彩空间') != -1:
                rlists.append(str)
            elif str.find('影片声道') != -1:
                rlists.append(str)
            elif str.find('DCP包大小') != -1:
                rlists.append(str)
                # DCP size is the last field of interest; stop scanning.
                break
            else:
                pass
        rlists.append(dcp_name)
        rlists.append(long_biao)
        print '*' * 80
        self.list2dict(rlists)
    def list2dict(self, lists):
        """
        Convert the ordered field list into a dict keyed by DB column
        name, insert it via MySQL, and echo it to stdout.

        Assumes the exact 12-entry order built by getFiled().  Entries 0
        and 1 are unicode, so [5:]/[6:] strip the label by CHARACTER
        count; entries 2+ are UTF-8 byte strings, so [15:] strips the
        label by BYTE count (4 CJK chars + full-width colon, 5 * 3 bytes
        -- assumes the labels never change; TODO confirm).
        """
        tmp1 = []
        tmp2 = []
        for i in range(len(lists)):
            if i == 0:
                tmp1.append('film_title')
                tmp2.append(lists[i][5:])
            elif i == 1:
                tmp1.append('caption_time')
                tmp2.append(lists[i][6:])
            elif i == 2:
                tmp1.append('film_duration')
                tmp2.append(lists[i][15:])
            elif i == 3:
                tmp1.append('film_scope')
                tmp2.append(lists[i][15:])
            elif i == 4:
                tmp1.append('film_type')
                tmp2.append(lists[i][15:])
            elif i == 5:
                tmp1.append('language_version')
                tmp2.append(lists[i][15:])
            elif i == 6:
                tmp1.append('color_space')
                tmp2.append(lists[i][15:])
            elif i == 7:
                tmp1.append('film_track')
                tmp2.append(lists[i][15:])
            elif i == 8:
                tmp1.append('macro_format')
                tmp2.append(lists[i][15:])
            elif i == 9:
                tmp1.append('dcp_size')
                tmp2.append(lists[i][15:])
            elif i == 10:
                tmp1.append('dcp_name')
                tmp2.append(lists[i][15:])
            elif i == 11:
                tmp1.append('long_biao')
                tmp2.append("暂无")
            else:
                pass
        result_dict = dict(zip(tmp1, tmp2))
        self.mysql.dealData(result_dict)
        for k, v in result_dict.items():
            print k, v
    def subString(self, filmTitle):
        """
        Return the film title with any trailing '[...]' part removed.
        """
        film_name = ""
        if filmTitle.find('[') != -1:
            index = filmTitle.index('[')
            film_name = filmTitle[:index]
        else:
            film_name = filmTitle
        return film_name
    def main(self):
        # Crawl only the first index page by default.
        self.getContents(1)
        # To crawl all 26 index pages instead:
        # for i in range(26):
        #     self.getContents(i+1)
# Script entry point: kick off the crawl (page 1 of the index; see ROV.main).
if __name__ == '__main__':
    ROV().main()
|