# Python script tool: source code for querying a site's Baidu search ranking.
# Baidu ranking query
import requests
import re
# Browser-like request headers passed to the requests.get calls in this file.
# The User-Agent is a desktop Chrome 68 string. Its platform tokens had been
# replaced by "**" placeholders ("Win ** ; x ** "); they are restored here to
# the standard "Win64; x64" form so the header is a well-formed browser UA.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    ),
}
def cx(keyword, x, cxurl):
    """Print the ids of Baidu results whose final URL contains ``cxurl``.

    Fetches one Baidu results page for ``keyword``, cuts it into result
    blocks with regular expressions, and for each block reads the numeric
    ``id`` attribute and the title link. The link is requested and the
    response's final URL (``Response.url``) is compared against ``cxurl``;
    on a match the id is printed.

    Args:
        keyword: Search phrase. It is percent-encoded before being placed in
            the query string so characters such as ``&`` or ``#`` cannot
            break the URL.
        x: Page selector. ``1`` requests ``pn=0``; any other value requests
            ``pn=<x>0`` (for example ``2`` -> ``pn=20``).
            NOTE(review): if ``pn`` is a result offset with 10 results per
            page, ``2`` would skip results 11-20 -- confirm the intended
            mapping before changing it.
        cxurl: Substring searched for in each result's final URL.

    Returns:
        None. Output is written with ``print``. If the results container is
        not found in the page, the function returns without printing ids.

    Raises:
        requests.RequestException: If the search page itself cannot be
            fetched. Failures while following an individual result link are
            reported and that result is skipped.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote, urljoin

    # The host was garbled in the original ("https:// ** .baidu.com").
    base_url = "https://www.baidu.com"
    timeout = 10  # seconds; without it a stalled server blocks the scan forever

    pn = 0 if x == 1 else f"{x}0"
    print(pn)
    url = f"{base_url}/s?wd={quote(str(keyword))}&ie=UTF-8&pn={pn}"
    print(url)
    html = requests.get(url, headers=headers, timeout=timeout).text

    container = re.search(
        r'<div id="content_left">(.+?)<div style="clear:both;height:0;"></div>',
        html,
        re.S,
    )
    if container is None:
        # Unexpected page (no results, changed markup, verification page...).
        return
    result_blocks = re.findall(
        r'<div class="result(.+?)</h3>', container.group(1), re.S
    )

    id_re = re.compile(r'" id="([0-9]{1,4})"', re.S)
    id_tpl_re = re.compile(r'id="([0-9]{1,4})" tpl="', re.S)
    # The original multi-line literals lost whatever whitespace they had, so
    # whitespace between tokens is matched flexibly with \s*.
    href_re = re.compile(r'\}"\s*href\s*=\s*"(.+?)"', re.S)
    # The original class name read "c-gap-bottom-s ** ll", which is not even a
    # valid regex (multiple repeat). Restored as "c-gap-bottom-small" --
    # presumably the real class name; verify against the live markup.
    h3_href_re = re.compile(
        r'<h3 class="t c-gap-bottom-small">\s*<a href="(.+?)"', re.S
    )

    for block in result_blocks:
        rank_match = id_re.search(block) or id_tpl_re.search(block)
        link_match = href_re.search(block) or h3_href_re.search(block)
        if rank_match is None or link_match is None:
            # Not a block this parser understands; skip instead of IndexError.
            continue
        rank = rank_match.group(1)
        # The original condition/prefix here was garbled; the apparent intent
        # is to turn a site-relative link into an absolute one. urljoin leaves
        # already-absolute links untouched.
        href = urljoin(base_url, link_match.group(1))
        try:
            final_url = requests.get(href, headers=headers, timeout=timeout).url
        except requests.RequestException as exc:
            # One unreachable result must not abort the whole scan.
            print(f"skipped {href}: {exc}")
            continue
        if cxurl in final_url:
            print(rank)
# Example: cx("工业设计考研", 2, "www.ugainian.com")
def cxpm(keyword, x, cxurl):
    """Print keyword, result id and final URL for Baidu results matching ``cxurl``.

    Fetches one Baidu results page for ``keyword`` with ``rn=50`` in the
    query string, cuts it into result blocks with regular expressions, and
    for each block reads the numeric ``id`` attribute and a result link. The
    link is requested and the response's final URL (``Response.url``) is
    compared against ``cxurl``; on a match ``keyword``, the id and the final
    URL are printed.

    Args:
        keyword: Search phrase. It is percent-encoded before being placed in
            the query string so characters such as ``&`` or ``#`` cannot
            break the URL.
        x: Page selector; the request uses ``pn=<x>0`` (``0`` -> ``pn=00``).
        cxurl: Substring searched for in each result's final URL.

    Returns:
        None. Output is written with ``print``. If the results container is
        not found in the page, the function returns without printing matches.

    Raises:
        requests.RequestException: If the search page itself cannot be
            fetched. Failures while following an individual result link are
            reported and that result is skipped.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote, urljoin

    # The host was garbled in the original ("https:// ** .baidu.com").
    base_url = "https://www.baidu.com"
    timeout = 10  # seconds; without it a stalled server blocks the scan forever

    url = f"{base_url}/s?wd={quote(str(keyword))}&ie=UTF-8&pn={x}0&rn=50"
    print(url)
    html = requests.get(url, headers=headers, timeout=timeout).text

    container = re.search(
        r'<div id="content_left">(.+?)<div style="clear:both;height:0;"></div>',
        html,
        re.S,
    )
    if container is None:
        # Unexpected page (no results, changed markup, verification page...).
        return
    result_blocks = re.findall(
        r'<div class="result(.+?)class="m">百度快照</a></div></div>',
        container.group(1),
        re.S,
    )

    id_re = re.compile(r'" id="([0-9]{1,4})"', re.S)
    id_tpl_re = re.compile(r'id="([0-9]{1,4})" tpl="', re.S)
    href_re = re.compile(
        r'<div class="f13"><a target="_blank" href="(.+?)" class="c-showurl"',
        re.S,
    )
    # The original class name read "c-gap-bottom-s ** ll", which is not even a
    # valid regex (multiple repeat). Restored as "c-gap-bottom-small" --
    # presumably the real class name; verify against the live markup. The
    # literal also lost its internal whitespace, hence \s*.
    h3_href_re = re.compile(
        r'<h3 class="t c-gap-bottom-small">\s*<a href="(.+?)"', re.S
    )

    for block in result_blocks:
        rank_match = id_re.search(block) or id_tpl_re.search(block)
        if rank_match is None:
            # No id attribute in this block; skip instead of IndexError.
            continue
        rank = rank_match.group(1)

        links = href_re.findall(block)
        print(links)  # debug output kept from the original
        if not links:
            links = h3_href_re.findall(block)
        if not links:
            continue
        # The original condition/prefix here was garbled; the apparent intent
        # is to turn a site-relative link into an absolute one. urljoin leaves
        # already-absolute links untouched.
        href = urljoin(base_url, links[0])
        try:
            final_url = requests.get(href, headers=headers, timeout=timeout).url
        except requests.RequestException as exc:
            # One unreachable result must not abort the whole scan.
            print(f"skipped {href}: {exc}")
            continue
        if cxurl in final_url:
            print(keyword, rank, final_url)
# Example: cxpm('工业设计考研', 0, "www.ugainian.com")
import re
# Scratch demo: non-greedy capture with re.S (dot also matches newlines)
# against a three-line sample string.
a = '''aaafg
sfdfgg1224
fssf'''

# Both anchors are present, so the capture is the text between them,
# including the surrounding line breaks.
re_htm = re.compile('aaafg(.+?)fssf', re.S).findall(a)
print(re_htm)

# 'aaafg' is followed by a newline in the sample, not by 's': no match.
re_htm1 = re.compile('aaafgs(.+?)fssf', re.S).findall(a)
print(re_htm1)

# Same leading anchor as above, so this cannot match either.
re_htm2 = re.compile('aaafgs(.+?)sfssf', re.S).findall(a)
print(re_htm2)