[Never Stop Learning][Python] Batch-finding download URLs with a web scraper
Technology always comes from human nature.
import requests
from bs4 import BeautifulSoup

### Visit each article page and collect its URLs
DL = []

def get_dl_link(link):
    global DL
    dl = []
    response = requests.get(link)
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'lxml')
    articles = soup.find_all('div', 'box-b')
    for article in articles:
        title = ""  # avoid a NameError if the block has no <h1>
        meta1 = article.find("h1")
        if meta1 is not None:
            title = meta1.getText().replace("[", "").replace("]", "").replace(" / ", " ")
            print(title)
        meta2 = article.find('div', 'entry')
        if meta2 is not None:
            poster = meta2.find('p', 'poster').find('img').get("src")
            dl += [title, poster]
            for i in meta2.find_all('p', 'screenshot'):  # screenshot images
                dl += [i.find('img').get("src")]
            for i in meta2.find_all('a'):  # download links
                if "wushare" in str(i) and "rar" not in str(i) and "zip" not in str(i):
                    dl += [i.get('href')]
    DL.append(dl)

### Main
page = 1
keyword = ""  # search keyword
while True:
    url = "http://javpop.com/page/" + str(page) + "?s=" + keyword
    response = requests.get(url)
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # Check whether we have run out of pages
    check = soup.find('h2').getText()
    if check == "Error 404 - Not Found":
        break
    articles = soup.find_all('li')
    for article in articles:
        # each article shows up as a link inside an <li>
        meta = article.find('a')
        if meta is not None and keyword in str(meta):
            # URL of each article
            link = meta.get("href")
            get_dl_link(link)
    page += 1  # move on to the next page

### Export
filename = "javpop_" + keyword + ".csv"
with open(filename, "w", encoding="utf8") as data:
    for i in DL:
        for j in i:
            data.write("%s," % j)
        data.write("\n")
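One note on the export step: since the fields are joined with bare commas, any title that itself contains a comma will shift the columns in the CSV. A minimal sketch using the standard csv module instead (assuming the same DL list of rows built above) handles that by quoting such fields:

import csv

# Assuming DL and keyword are the same objects built by the script above
filename = "javpop_" + keyword + ".csv"
with open(filename, "w", newline="", encoding="utf8") as f:
    writer = csv.writer(f)
    for row in DL:
        writer.writerow(row)  # csv.writer quotes any field that contains a comma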
I regularly need to grab a large batch of things, and if I rely on manual work alone, the fastest method is to view the page source and then pick out the parts containing the download URLs. That gets exhausting, so I tried writing a program with Python, requests, and BeautifulSoup that finds the download URLs directly. Unexpectedly, figuring out how to locate the content I wanted through the HTML tags took most of my time (wry smile).
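For anyone equally new to BeautifulSoup, here is a minimal sketch of that tag-based lookup idea. The HTML fragment and class names below are made up purely for illustration and are not taken from the real site:

from bs4 import BeautifulSoup

# A made-up HTML fragment just to show tag/class lookups
html = '''
<div class="box-b">
  <h1>Sample Title</h1>
  <div class="entry">
    <a href="https://example.com/file1">mirror A</a>
    <a href="https://example.com/file2.rar">mirror B (rar)</a>
  </div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
box = soup.find('div', 'box-b')                       # second positional argument matches the class attribute
title = box.find('h1').getText()                      # text inside the <h1> tag
links = [a.get('href') for a in box.find_all('a')]    # collect every href inside the block
print(title, links)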
Screenshot of the exported output
I originally based this on a PTT crawler, but it seems a different site can't reuse the same code directly; you have to study that site's HTML structure all over again. I'll tackle the other site when I have time.
Also, although this saves the time of hunting for URLs by hand, I still have to copy them into JDownloader2 manually and save them as a DLC. I'm still looking for a way to save the URLs directly as a DLC.
I'm new to web scraping, so if my understanding is wrong or there's a better way to write this, please let me know!