# coding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
import time
from english.EnglishLib import *
# URLs of images already saved during this run, so an image referenced by
# several threads is downloaded only once (checked in GetFreebbs).
DownloadedUrls = []
# NOTE(review): my_headers is never referenced in this file — looks like a
# leftover alternative header set; confirm before removing.
my_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2526.80 Safari/537.36 Core/1.45.933.400 QQBrowser/9.0.8699.400',
'Accept-Encoding': 'gzip, deflate, sdch'}
# Headers actually sent with every request (desktop Chrome user agent).
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
}
proxy = '127.0.0.1:8580' # local proxy
# proxy='username:password@123.58.10.36:8080'
# Route both HTTP and HTTPS traffic through the local proxy above.
proxies = {
'http': 'http://' + proxy,
'https': 'https://' + proxy
}
# --- Image-URL blacklist: forum chrome, invalid links, and known junk ---
# Exact URLs to skip.
_SKIP_EXACT = {
    "http://dajue.freebbs.tw/images/default/top.gif",
    "http://dajue.freebbs.tw/images/default/plurk.png",
}
# Skip any URL starting with one of these prefixes.
_SKIP_PREFIXES = (
    "/",
    "http://E:",
    "ttp://b301.photo.store.qq.com",
    "http://cimg2.163.com/",
    "http://mail.qq.com/",
    "http://user.freebbs.tw/",
    "seccode.php",
    "http://file:///images",
    "http://xinshidai.forumer.com/styles/SpringTime/",
)
# Skip any URL ending with one of these suffixes (forum button graphics etc.).
_SKIP_SUFFIXES = (
    "/top.gif",
    "/plurk.gif",
    "/reply.gif",
    "/newtopic.gif",
    "/250SZSA.jpg",
)
# Rough "is this a well-formed http(s) URL" filter (kept from the original).
_URL_RE = re.compile(
    "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")


def _skip_image_url(url):
    """Return True when `url` must NOT be downloaded (malformed or blacklisted)."""
    if not _URL_RE.match(url):
        return True
    if not url.startswith("http"):
        return True
    return (url in _SKIP_EXACT
            or url.startswith(_SKIP_PREFIXES)
            or url.endswith(_SKIP_SUFFIXES))


def GetFreebbs(i, name):
    """Fetch printable thread `i` from http://<name>.freebbs.tw and save it.

    The page HTML is written to h:/data/<name>/<i>.txt (or the literal text
    'not exist' for deleted/pending threads, detected via the page's
    javascript:history.back() redirect).  Every <img> on the page that passes
    the blacklist is downloaded into h:/data/<name>/img/.

    Connection errors are printed and swallowed so a long batch run survives
    individual failures.  Returns None.
    """
    try:
        url = ("http://" + name + ".freebbs.tw/viewthread.php?action=printable&tid="
               + str(i))
        print(url)
        resp = requests.get(url, headers=headers, proxies=proxies,
                            allow_redirects=False)
        page = resp.content
        if resp.status_code != 200:
            print(resp.status_code)
            # BUG FIX: the original assigned to resp.content, which is a
            # read-only property on requests.Response and raised
            # AttributeError.  Keep the placeholder body in a local instead.
            page = bytes("<html>" + str(resp.status_code) + "</html>",
                         encoding='UTF-8')
        # Deleted / under-review threads redirect back via history.back().
        if resp.text.find("javascript:history.back()") != -1:
            with open("h:/data/" + name + "/" + str(i) + ".txt", "wb") as f:
                f.write(bytes('not exist', encoding='UTF-8'))
            return
        print(resp.url)
        with open("h:/data/" + name + "/" + str(i) + ".txt", "wb") as f:
            f.write(page)
        # --- download the images referenced by the page ---
        bsobj = BeautifulSoup(resp.content, 'lxml')
        for img in bsobj.find_all('img'):
            url = img.get('src')
            # BUG FIX: <img> without a src gives None; the original then
            # crashed inside re.match.  Skip such tags.
            if not url:
                continue
            # Never download the same image twice in one run.
            if url in DownloadedUrls:
                continue
            if _skip_image_url(url):
                continue
            print(url)
            try:
                resp = requests.get(url, headers=headers, proxies=proxies,
                                    timeout=(3, 3))
                resp.raise_for_status()  # treat any non-2xx as a failure
            except requests.RequestException as e:
                print(e)
                continue
            DownloadedUrls.append(url)
            filename = os.path.basename(url)
            if len(filename) > 256:
                continue
            # Replace characters that are illegal in Windows filenames.
            for bad in ('?', ':', '*'):
                filename = filename.replace(bad, "_")
            with open("h:/data/" + name + "/img/" + filename, "wb") as f:
                f.write(resp.content)
    except requests.exceptions.ConnectionError as e:
        print('错误:', e.args)
if __name__ == "__main__":
    # Crawl the "xianzhen" board: threads 1..19999 into h:/data/xianzhen/.
    # (Was `if True:`; the main-guard keeps the crawl from firing on import.)
    name = "xianzhen"
    SavePath = "h:/data/"
    # makedirs with exist_ok replaces the two exists()/mkdir() pairs and also
    # creates h:/data itself when it is missing (mkdir would have raised).
    os.makedirs(SavePath + name + "/img", exist_ok=True)
    for i in range(1, 20000):
        print(i)
        GetFreebbs(i, name)
# [ ¥»©«³Ì«á¥Ñ §õ¬x§Ó ©ó 2021-4-24 09:56 ½s¿è ]
# (Stray forum-footer text and two '|' lines were pasted into the file;
# commented out so the module parses as Python.)