Mofanpy-scraping
source: https://mofanpy.com/tutorials/data-manipulation/scraping/
from urllib.request import urlopen
import re
# decode() in case the page contains Chinese; not every HTML page is served as UTF-8
html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)
res = re.findall(r"<title>(.+?)</title>", html)
print('\nPage title is: ', res[0])
res = re.findall(r"<p>(.+?)</p>", html, flags = re.DOTALL)
# re.DOTALL if such multi line as tab, new line
print('\nPage paragraph is: ', res[0])
res = re.findall(r'href="(.+?)"', html)
print('\nAll links: \n', res)
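A tiny self-contained check of what re.DOTALL changes (the `multi` string is just for illustration):

import re
multi = "<p>line one\nline two</p>"
print(re.findall(r"<p>(.+?)</p>", multi))                   # [] -- plain `.` stops at \n
print(re.findall(r"<p>(.+?)</p>", multi, flags=re.DOTALL))  # ['line one\nline two']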
from bs4 import BeautifulSoup
from urllib.request import urlopen
# decode() in case the page contains Chinese
html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)
print('\n', soup.p)
all_href = soup.find_all('a')  # find_all can also filter by attrs, see the sketch below
all_href = [l['href'] for l in all_href]
# l['href'] reads the href attribute of each <a> tag, dict-style
print('\n', all_href)
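On the attrs filter mentioned above: find_all accepts an attrs dict, and the value True matches any attribute value, so <a> tags without an href can be skipped up front. A short sketch against the same soup:

# attrs filter: True matches any href value; <a> tags without href are skipped
linked = soup.find_all('a', attrs={'href': True})
print([a['href'] for a in linked])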
CSS
from bs4 import BeautifulSoup
from urllib.request import urlopen
# decode() in case the page contains Chinese
html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
print(html)
soup = BeautifulSoup(html, features='lxml')
# use class to narrow search
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m.get_text())
jan = soup.find('ul', {"class": 'jan'})
d_jan = jan.find_all('li') # use jan as a parent
for d in d_jan:
    print(d.get_text())
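BeautifulSoup also accepts CSS selectors through select(), which reads naturally for this kind of class-based narrowing. A sketch of the same two queries (the selector strings assume the class names used on this page):

# same queries written as CSS selectors
for m in soup.select('li.month'):
    print(m.get_text())
for d in soup.select('ul.jan li'):
    print(d.get_text())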
Regex
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# decode() in case the page contains Chinese
html = urlopen("https://mofanpy.com/static/scraping/table.html").read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
# match only image sources ending in .jpg (raw string avoids invalid-escape warnings)
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])
# match course links whose href starts with https://morvan
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
for link in course_links:
    print(link['href'])
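find_all also takes a plain function as an attribute filter, which sidesteps regex escaping altogether. A sketch matching the same .jpg images this way:

# callable filter: the function receives the attribute value (None if the tag has no src)
img_links = soup.find_all('img', src=lambda s: s is not None and s.endswith('.jpg'))
for link in img_links:
    print(link['src'])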
Baidu Baike
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random
base_url = "https://baike.baidu.com"
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]
for i in range(10):
    url = base_url + his[-1]
    try:
        html = urlopen(url, timeout=1).read().decode('utf-8')
        soup = BeautifulSoup(html, features='lxml')
        print(i, soup.find('h1').get_text(), '; url: ', his[-1])
        # follow only percent-encoded item links; a looser alternative pattern: /item/\w+?
        sub_urls = soup.find_all('a', {'target': '_blank', 'href': re.compile('^/item/(%.{2})+$')})
        if len(sub_urls) != 0:
            # jump to a random qualifying link on this page
            his.append(random.sample(sub_urls, 1)[0]['href'])
        else:
            # dead end: no qualifying links, backtrack one step
            his.pop()
    except Exception:
        print('Error opening: ' + url)
        continue
print(his)
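One caveat: the random walk above can revisit pages it has already seen. A minimal dedup sketch, assuming we only want unvisited links (the pick_next helper and the seen set are illustrative additions, not part of the original tutorial):

import random

def pick_next(sub_urls, his, seen):
    # keep only hrefs we have not visited yet (hypothetical helper)
    candidates = [a['href'] for a in sub_urls if a['href'] not in seen]
    if candidates:
        nxt = random.choice(candidates)
        seen.add(nxt)
        his.append(nxt)
    else:
        his.pop()  # every link here was already seen: backtrack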
Requests
import requests
import webbrowser
# param = {'q': 'bGZoCg'}
# r = requests.get('http://www.google.com/search', params=param)
# print(r.url)
# webbrowser.open(r.url)
# r.url would look like: https://www.google.com/search?client=firefox-b-d&q=test
# google.com needs a proxy from some networks, so this example stays commented out
# data = {'firstname': '莫烦', 'lastname': '周'}
# r = requests.post('http://pythonscraping.com/files/processing.php', data=data)
# print(r.text)
# print(r.url)
# print(r.content)
# print(r.json())  # parses the body as JSON; this endpoint returns plain text, so it would fail
payload = {'username': 'Morvan', 'password': 'password'}
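# manual version: pass the login cookies along by hand on the follow-up request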
# r = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
# print(r.cookies.get_dict())
# r = requests.get('http://pythonscraping.com/pages/cookies/profile.php', cookies=r.cookies)
# print(r.text)
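# Session version: the session stores the login cookies and resends them automatically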
session = requests.Session()
r = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())
r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(r.text)
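A Session carries more than cookies: it reuses the underlying connection and keeps default headers across requests. A small sketch, assuming we want a custom User-Agent on every call (the UA string is made up):

session = requests.Session()
session.headers.update({'User-Agent': 'my-scraper/0.1'})  # hypothetical UA, sent with every request
r = session.get('http://pythonscraping.com/pages/cookies/profile.php')
print(r.request.headers['User-Agent'])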
Requests docs: http://docs.python-requests.org/en/master/