A Fun Python Crawler Project
Summary
This article walks through a fun little Python crawler project: a tkinter chat assistant that scrapes one-liners, jokes, and quotes from the web and serves them up as replies. It should be a useful reference if you are learning requests and BeautifulSoup.
Article Body
The script below builds a small chat window for a bot named 小糖 ("Little Sugar"). Whatever you type is matched against a few keywords, and the bot answers with a one-liner from duanziwang.com, a joke from qiushibaike.com, or a quote scraped from siandian.com.
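Before running it, note that the script depends on two third-party packages, requests and beautifulsoup4 (tkinter ships with the standard CPython installer). Assuming pip is available, they install with:

    pip install requests beautifulsoup4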
from tkinter import *
import time
import requests
from bs4 import BeautifulSoup
import bs4
import random
import re
def getHTMLText(url):
    # Fetch a page and return its text; return '' if the request fails.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("request failed")
        return ""
def fillUnivList(ulist, html):
    # duanziwang.com: each post sits in an <article>; collect the text of its
    # first three <a> tags (only the first is actually used later).
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find_all('article'):
        if isinstance(tr, bs4.element.Tag):
            tds = tr('a')
            ulist.append([tds[0].string, tds[1].string, tds[2].string])

def printUnivList(ulist, k):
    # Return the text of the k-th collected post.
    return ulist[k][0]
def getduanzi():
    # Pick a random page of the one-liner category, then a random post on it.
    uinfo = []
    u = 'http://duanziwang.com/category/%E4%B8%80%E5%8F%A5%E8%AF%9D%E6%AE%B5%E5%AD%90/'
    i = random.randint(1, 49)
    url = u + str(i) + '/'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    k = random.randint(0, 9)
    return printUnivList(uinfo, k)
def fill2(ulist, html):
    # qiushibaike.com: each joke sits in a div with these classes; the text
    # lives in <span> tags under its div.content child.
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find_all('div', 'article block untagged mb15 typs_hot'):
        if isinstance(tr, bs4.element.Tag):
            tds = tr.find('div', 'content')
            tdss = tds('span')
            reg = re.compile('<[^>]*>')      # strip HTML tags
            text = reg.sub('', str(tdss))
            regg = re.compile(r'\[|\]|\n')   # strip list brackets and newlines
            text = regg.sub('', text)
            ulist.append(text)
def getjoke():
    # Pick a random page of the text-jokes section, then a random joke on it.
    ulist = []
    u = 'https://www.qiushibaike.com/text/page/'
    i = random.randint(1, 13)
    url = u + str(i) + '/'
    html = getHTMLText(url)
    fill2(ulist, html)
    k = random.randint(0, 16)
    return str(ulist[k])
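
# Quick standalone check of the two scrapers above (an assumption here: both
# sites still serve the markup fillUnivList() and fill2() expect; either may
# have changed or gone offline since this was written):
#   print(getduanzi())
#   print(getjoke())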
def getHTMLText2(url):
    # Like getHTMLText(), but re-decode using the apparent encoding, which
    # siandian.com needs for its Chinese text to come through intact.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""
def geturl(url, text):
    # Scan every link on the page and return the href of the first one whose
    # anchor text appears in the user's message; '' if none matches.
    html = getHTMLText2(url)
    soup = BeautifulSoup(html, "html.parser")
    for tds in soup.find_all('a'):
        if str(tds.string) in text:
            return tds.attrs['href']
    return ''
def getmoreurl(url):
    # Pick a random article link from a tag listing page.
    html = getHTMLText2(url)
    soup = BeautifulSoup(html, "html.parser")
    try:
        li = soup.find_all('li', attrs={'class': 'articleTitle fl'})
        k = random.randint(0, len(li) - 1)
        return li[k]('a')[0].attrs['href']
    except Exception:
        return ''
def getsen(url):
    # Grab the article's first <p>, keep only the runs of Chinese characters,
    # and stitch them back together as one sentence.
    html = getHTMLText2(url)
    soup = BeautifulSoup(html, "html.parser")
    try:
        li = soup.find('p')
        reg = re.compile('<p>.*')
        l = reg.findall(str(li))
        k = random.randint(0, len(l) - 1)
        text = re.findall('[\u4e00-\u9fa5]+', l[k])
        x = ''
        if len(text):
            for t in text[:-1]:
                x = x + t + ','
            return x + text[-1] + '。'
        else:
            return "不好意思,出了点小问题,请重试!"
    except Exception:
        return "抱歉,没找到你想要的"
def getsentance(text):
    # Three-step lookup on siandian.com: tag index -> tag page -> article.
    start_url = 'http://www.siandian.com'
    urll = 'http://www.siandian.com/tags.html'
    end1 = geturl(urll, text)
    if end1 == '':
        return "抱歉,没有找到你想要的。"
    end2 = getmoreurl(start_url + end1)
    if end2 == '':
        return "抱歉,没有找到你想要的。"
    return getsen(start_url + end2)
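
# Example of the flow above with a hypothetical input: for a message
# containing '情话', geturl() finds that tag's link on tags.html,
# getmoreurl() picks a random article listed under the tag, and getsen()
# returns one cleaned-up sentence from that article.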
def xiaotang(s):
    # Route the message to a reply by simple keyword matching.
    if '段子' in s:
        return getduanzi()
    elif '笑话' in s:
        return getjoke()
    elif '句' in s or '话' in s:
        return getsentance(s)
    elif '傻子' in s or '草' in s or '日' in s:
        return '这是脏话不可以说哦'
    elif '二' in s or '垃圾' in s or '傻逼' in s:
        t = '你是魔鬼吗?'
        x = ''
        for i in range(10):
            x = x + t + ' !' + '\n'
        return x
    else:
        return "我好像不明白\n"
def main():
    def start():
        # Print the bot's greeting into the reply pane.
        strMsg = '小糖:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n '
        txtget.insert(END, strMsg, 'redcolor')
        txtget.insert(END, '你好,请问有什么可以帮忙的?')

    def sendMsg():  # send a message
        t = txtMsg.get('1.0', END)
        txtMsg.delete('1.0', END)
        strMsg = '我:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n '
        # Pad the shorter pane with blank lines so the two panes stay aligned.
        for i in range(int(txtget.index(END).split(".")[0]) - int(txtMsgList.index(END).split(".")[0]) + 1):
            txtMsgList.insert(END, '\n')
        txtMsgList.insert(END, strMsg, 'greencolor')
        txtMsgList.insert(END, t)
        txtMsgList.see(END)
        for i in range(int(txtMsgList.index(END).split(".")[0]) - int(txtget.index(END).split(".")[0]) + 1):
            txtget.insert(END, '\n')
        txtget.see(END)
        strMsg = '小糖:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n '
        for i in range(int(txtMsgList.index(END).split(".")[0]) - int(txtget.index(END).split(".")[0]) + 1):
            txtget.insert(END, '\n')
        txtget.insert(END, strMsg, 'redcolor')
        txtget.insert(END, xiaotang(t))
        txtget.see(END)
        for i in range(int(txtget.index(END).split(".")[0]) - int(txtMsgList.index(END).split(".")[0]) + 1):
            txtMsgList.insert(END, '\n')
        txtMsgList.see(END)

    def cancelMsg():  # clear the input box
        txtMsg.delete('1.0', END)

    def sendMsgEvent(event):  # send on the Enter key
        sendMsg()

    # Create the window
    t = Tk()
    t.title('小糖助手')
    # Create the frame containers
    frmLT = Frame(width=500, height=320, bg='#F19C8B')
    frmLC = Frame(width=500, height=150, bg='#F19C8B')
    frmLB = Frame(width=500, height=30, bg='white')
    frmRT = Frame(width=200, height=500, bg='#F19C8B')
    # Create the widgets
    txtMsgList = Text(frmLT, width=40, bd=0)
    txtMsgList.tag_config('greencolor', foreground='#008C00')  # tag for my messages
    txtMsg = Text(frmLC)
    txtget = Text(frmLT, width=40, bd=0)
    txtget.tag_config('redcolor', foreground='#DC143C')  # tag for the bot's messages
    start()
    txtMsg.bind('<Return>', sendMsgEvent)
    btnSend = Button(frmLB, text='发 送', width=8, command=sendMsg, bg='#E88384', bd=0)
    btnCancel = Button(frmLB, text='取消', width=8, command=cancelMsg, bg='#F3ADA0', bd=0)
    scollor = Scrollbar(bg='white')

    def scrollBoth(*args):
        # One scrollbar drives both panes so they move together.
        txtget.yview(*args)
        txtMsgList.yview(*args)

    scollor.config(command=scrollBoth)
    txtget.config(yscrollcommand=scollor.set)
    txtMsgList.config(yscrollcommand=scollor.set)
    imgInfo = PhotoImage(file="aa.png")  # decorative side image; needs an aa.png next to the script
    lblImage = Label(frmRT, image=imgInfo)
    lblImage.image = imgInfo
    # Window layout
    frmLT.grid(row=0, column=0, columnspan=2, padx=0, pady=0)
    frmLC.grid(row=1, column=0, columnspan=2, padx=0, pady=0)
    frmLB.grid(row=2, column=0, columnspan=2, padx=0)
    scollor.grid(row=0, column=2, sticky=N+S)
    frmRT.grid(row=0, column=3, rowspan=3, padx=0, pady=0)
    # Keep the frames at their fixed sizes
    frmLT.grid_propagate(0)
    frmLC.grid_propagate(0)
    frmLB.grid_propagate(0)
    frmRT.grid_propagate(0)
    btnSend.grid(row=2, column=0)
    btnCancel.grid(row=2, column=1)
    lblImage.grid()
    txtget.grid(row=0, column=0)
    txtMsgList.grid(row=0, column=1)
    txtMsg.grid()
    # Main event loop
    t.mainloop()

if __name__ == '__main__':
    main()
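If you only want to exercise the reply logic without opening the window, a minimal console check is sketched below (it assumes network access and that the scraped sites still respond; the messages are just sample inputs):

    # Console smoke test for the keyword routing; sample inputs only.
    for msg in ['来个段子', '讲个笑话', '来一句情话', '你好呀']:
        print(msg, '->', xiaotang(msg))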
Note: That wraps up this look at a fun Python crawler project; keep an eye out for more related articles.