Webscraping

Hooray! Web scraping! This is really fun!

Context

What I use:

  • Webclient: requests
  • HTML parser: lxml [1]
  • Manipulating the parse tree: Beautiful Soup (see the sketch after this list)
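
A minimal sketch of how these three pieces fit together. Assumption: the lxml package is installed (pip3 install lxml); the scripts below use html.parser instead, which works the same way but ships with the standard library:

#! /usr/bin/python3
#
# Minimal stack sketch: requests fetches, lxml parses,
# Beautiful Soup wraps the parse tree
###################################################################
#
import requests
from bs4 import BeautifulSoup

url = 'https://www.newegg.com/global/nl-en/p/pl?d=graphics+card'
html = requests.get(url).text

# "lxml" selects the lxml parser; "html.parser" would also work here
soup = BeautifulSoup(html, 'lxml')

print(soup.h1)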

Cases

  • Copying WooCrack: How can you copy WooCrack, including the downloads? This applies to the situation where you have login credentials
  • Price bot: How can I use a script to see competitors' prices for certain products? In this case I would collect the URLs of the competitors' websites beforehand
  • All carbon-brush info: For a client who trades in carbon brushes, can I download every carbon brush in the world, including all related data?

Open questions

  • How do you scrape sites where you first have to log in? A few extra steps with the webclient? → See the requests example, and the sketch below this list.
  • How do you download files, such as from WooCrack? → Also covered in the sketch below.
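
A sketch for both questions, using requests. The URL and the form-field names here are made up; they depend on the login form of the actual site:

#! /usr/bin/python3
#
# Log in first, then download a file - Sketch
###################################################################
#
import requests

s = requests.Session()

# POST the credentials; the Session object keeps the cookies for
# all subsequent requests (hypothetical URL and field names!)
s.post('https://example.com/wp-login.php',
       data={'log': 'username', 'pwd': 'password'})

# Download a file: stream it to disk in chunks, so a large file
# doesn't have to fit in memory
r = s.get('https://example.com/downloads/plugin.zip', stream=True)
with open('plugin.zip', 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)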

Webclients

To interact with a webserver, you need a webclient. The usual Python packages for this:

  • urllib
  • urllib2
  • urllib3
  • requests - A standalone package. Not part of any of the other packages! - Probably the best package [2]. See Requests (Python). A quick comparison with urllib follows below this list.
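
For comparison, the same GET request with urllib and with requests (a sketch; requests decodes the page to a string for you, urllib hands you bytes):

#! /usr/bin/python3
#
# urllib vs. requests - Sketch
###################################################################
#
from urllib.request import urlopen
import requests

url = 'https://www.newegg.com/global/nl-en/p/pl?d=graphics+card'

html_urllib   = urlopen(url).read().decode('utf-8')  # bytes → str by hand
html_requests = requests.get(url).text               # str right away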

NewEgg example - Original

This code follows this tutorial most closely, including the use of urllib and aliases.

Goal

A list with:

  • Product titles
  • SKUs (if available)
  • EAN codes (if available)
  • Prices.

Script

#! /usr/bin/python3
#
# Newegg webcrawling-example - Data Science Dojo
###################################################################
#
# Source: https://www.youtube.com/watch?v=XQgXKtPSzUI
#
# Goals
#######
#
# Create a list with the following info per product:
#
# * Brand
# * Title
# * Price
# * SKU (if available)
# * EAN-code (if available)
#
print ("\n\n\n")
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print (">>> 100-Newegg.py")
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print ("\n\n")


###################################################################
# Load libraries
###################################################################
#
# Beautiful Soup
################
#
# * For processing websites; the actual crawling
# * Only "BeautifulSoup" is imported from bs4
# * "soup" works as an alias
#
from bs4 import BeautifulSoup as soup

# Webclient
################
#
# * From urllib.request, only urlopen is needed
# * "uReq" works like an alias
#
from urllib.request import urlopen as uReq


###################################################################
# Fetch a webpage
###################################################################
#
# The page we want to scrape
############################
#
my_url = 'https://www.newegg.com/global/nl-en/p/pl?d=graphics+card'

# Download the page to object p
#########################################
#
p = uReq(my_url)

# What kind of object is this?
##############################
#
# print(type(p))
#
# Reply:
#
# <class 'http.client.HTTPResponse'>

# What methods does this object have?
##################################
#
dir(p)
#
# Reply:
#
# ['__abstractmethods__', '__class__', '__del__', '__delattr__', '__dict__', 
# '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', 
# '__getattribute__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', 
# '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', 
# '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', 
# '__subclasshook__', '_abc_cache', '_abc_negative_cache', 
# '_abc_negative_cache_version', '_abc_registry', '_checkClosed', 
# '_checkReadable', '_checkSeekable', '_checkWritable', '_check_close', 
# '_close_conn', '_get_chunk_left', '_method', '_peek_chunked', 
# '_read1_chunked', '_read_and_discard_trailer', '_read_next_chunk_size', 
# '_read_status', '_readall_chunked', '_readinto_chunked', '_safe_read', 
# '_safe_readinto', 'begin', 'chunk_left', 'chunked', 'close', 'closed', 
# 'code', 'debuglevel', 'detach', 'fileno', 'flush', 'fp', 'getcode', 
# 'getheader', 'getheaders', 'geturl', 'headers', 'info', 'isatty', 
# 'isclosed', 'length', 'msg', 'peek', 'read', 'read1', 'readable', 'readinto', 
# 'readinto1', 'readline', 'readlines', 'reason', 'seek', 'seekable', 'status', 
# 'tell', 'truncate', 'url', 'version', 'will_close', 'writable', 'write', 
# 'writelines']

# Put the actual content into a variable
############################################
#
p_html = p.read()

# What type of variable has this become? → bytes
###########################################
#
type(p_html)
#
# Reply: 
#
# <class 'bytes'>
# Reason that this is 'bytes' and not e.g., 'text': A page can contain mixed
# text/binary content 

# Close the connection
####################
#
# Why? HTTP is a stateless protocol, isn't it? Whatever
#
p.close()


###################################################################
# Process the webpage
###################################################################
#
# Parse this object as an html-object (and not like e.g., an XML-
# or FTP-object)
#
p_soup = soup(p_html, "html.parser")

# What class is p_soup?
############################
#
type(p_soup)
#
# → <class 'bs4.BeautifulSoup'>

# What methods does p_soup have?
#################################
#
dir(p_soup)
#
# Reply:
#
# ['ASCII_SPACES', 'DEFAULT_BUILDER_FEATURES', 'NO_PARSER_SPECIFIED_WARNING', 
# 'ROOT_TAG_NAME', '__bool__', '__call__', '__class__', '__contains__', '__copy__', 
# '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', 
# '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', 
# '__getstate__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__len__', 
# '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', 
# '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', 
# '__subclasshook__', '__unicode__', '__weakref__', '_all_strings', 
# '_check_markup_is_url', '_feed', '_find_all', '_find_one', '_is_xml', 
# '_lastRecursiveChild', '_last_descendant', '_linkage_fixer', 
# '_most_recent_element', '_namespaces', '_popToTag', '_should_pretty_print', 
# 'append', 'attrs', 'builder', 'can_be_empty_element', 'cdata_list_attributes', 
# 'childGenerator', 'children', 'clear', 'contains_replacement_characters', 
# 'contents', 'currentTag', 'current_data', 'declared_html_encoding', 'decode', 
# 'decode_contents', 'decompose', 'descendants', 'encode', 'encode_contents', 
# 'endData', 'extend', 'extract', 'fetchNextSiblings', 'fetchParents', 
# 'fetchPrevious', 'fetchPreviousSiblings', 'find', 'findAll', 'findAllNext', 
# 'findAllPrevious', 'findChild', 'findChildren', 'findNext', 'findNextSibling', 
# 'findNextSiblings', 'findParent', 'findParents', 'findPrevious', 
# 'findPreviousSibling', 'findPreviousSiblings', 'find_all', 'find_all_next', 
# 'find_all_previous', 'find_next', 'find_next_sibling', 'find_next_siblings', 
# 'find_parent', 'find_parents', 'find_previous', 'find_previous_sibling', 
# 'find_previous_siblings', 'format_string', 'formatter_for_name', 'get', 'getText', 
# 'get_attribute_list', 'get_text', 'handle_data', 'handle_endtag', 'handle_starttag', 
# 'has_attr', 'has_key', 'hidden', 'index', 'insert', 'insert_after', 'insert_before', 
# 'isSelfClosing', 'is_empty_element', 'is_xml', 'known_xml', 'markup', 'name', 
# 'namespace', 'new_string', 'new_tag', 'next', 'nextGenerator', 'nextSibling', 
# 'nextSiblingGenerator', 'next_element', 'next_elements', 'next_sibling', 
# 'next_siblings', 'object_was_parsed', 'original_encoding', 'parent', 
# 'parentGenerator', 'parents', 'parse_only', 'parserClass', 'parser_class', 
# 'popTag', 'prefix', 'preserve_whitespace_tag_stack', 'preserve_whitespace_tags', 
# 'prettify', 'previous', 'previousGenerator', 'previousSibling', 
# 'previousSiblingGenerator', 'previous_element', 'previous_elements', 
# 'previous_sibling', 'previous_siblings', 'pushTag', 'recursiveChildGenerator', 
# 'renderContents', 'replaceWith', 'replaceWithChildren', 'replace_with', 
# 'replace_with_children', 'reset', 'select', 'select_one', 'setup', 'smooth', 
#'string', 'strings', 'stripped_strings', 'tagStack', 'text', 'unwrap', 'wrap']


# Try out some stuff...
############################
#
p_soup.h1			# → <h1 class="page-title-text">"graphics card"</h1>
p_soup.p   			# First p-tag
p_soup.meta 		# First meta tag
p_soup.body			# Simply the body of the page :)
p_soup.body.span	# First span-tag


##############################################################
# OK - Have a closer look at cs[10]
##############################################################
#
# cs = p_soup.findAll("div",{"class":"item-container"})

# type(cs)	# <class 'bs4.element.ResultSet'>
# len(cs)		# Number of elements = 40

# * Item cs[10] is a good example, as it has a price (not all items have prices)
#
# c=cs[10]
# type(c)   # bs4.element.Tag

# Fetch brand
############################################
#
# Like this:
#
# <div class="item-info"> » 
# <a 
#    class="item-brand" 
#    href="https://www.newegg.com/global/nl-en/GIGABYTE/BrandStore/ID-1314">
#    <img 
#       alt="GIGABYTE" 
#       src="//c1.neweggimages.com/Brandimage_70x28//Brand1314.gif" 
#       title="GIGABYTE"/>
# </a>
#
# c_brand = c.find(class_="item-brand").img['alt']
# print ("c10 - Brand: "+c_brand)

# # Product name
# ############################################
# #
# c_name = c.find(class_="item-title").text
# print("c10 - Name: "+c_name)

# # Fetch price
# ############################################
# #
# c_price = c.find(class_="price-current").strong.text
# print ("c10 - Price: "+c_price)


##############################################################
# Iterate over items in cs
##############################################################
#
# Create a resultset with all "item-container" div classes
##########################################################
#
# * This is actually plain HTML code
# * "div" has one argument-value-pair (or whatever its called). 
#   That's included here as a dictionary: {"argument":"value"}
#
print (">>> Create resultset cs...")

cs = p_soup.findAll("div",{"class":"item-container"})

type(cs)	# <class 'bs4.element.ResultSet'>
len(cs)		# Number of elements = 40
i=0

for c in cs:
	print("\n\n>>>>>>>>> Volgende element")
	i=i+1
	print(i)

	if (c.find(class_="item-brand") is not None):
		c_brand = c.find(class_="item-brand").img['alt']
		print ("Brand: "+c_brand)

	if (c.find(class_="item-title") is not None):
		c_name = c.find(class_="item-title").text
		print("Name: "+c_name)

	if (c.find(class_="price-current") is not None):
		c_price = c.find(class_="price-current").strong.text
		print ("Price: "+c_price)

NewEgg example - Refactored

  • With requests instead of urllib
  • Without aliases
  • A bit shorter.
#! /usr/bin/python3
#
# Newegg webcrawling-example - Refactored
###################################################################
#
print ("\n\n\n")
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print (">>> 105-Newegg-refactored.py")
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print ("\n\n")


###################################################################
# Load libraries
###################################################################
#
# Beautiful Soup
################
#
from bs4 import BeautifulSoup

# Webclient
################
#
import requests


###################################################################
# Fetch a webpage
###################################################################
#
# The page we want to scrape
############################
#
url = 'https://www.newegg.com/global/nl-en/p/pl?d=graphics+card'

# Download the page to text object p
#########################################
#
# * type(requests.get(url)): <class 'requests.models.Response'>
# * Attribute "text" converts this directly to a string
#
p = requests.get(url).text


###################################################################
# Process the webpage
###################################################################
#
# Parse this object as an html-object (and not like e.g., an XML-
# or FTP-object)
#
p_soup = BeautifulSoup(p, "html.parser")

# What class is p_soup?
############################
#
type(p_soup)
#
# → <class 'bs4.BeautifulSoup'>

# What methods does p_soup have?
#################################
#
dir(p_soup)
#
# Reply:
#
# ['ASCII_SPACES', 'DEFAULT_BUILDER_FEATURES', 'NO_PARSER_SPECIFIED_WARNING', 
# 'ROOT_TAG_NAME', '__bool__', '__call__', '__class__', '__contains__', '__copy__', 
# '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', 
# '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', 
# '__getstate__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__len__', 
# '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', 
# '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', 
# '__subclasshook__', '__unicode__', '__weakref__', '_all_strings', 
# '_check_markup_is_url', '_feed', '_find_all', '_find_one', '_is_xml', 
# '_lastRecursiveChild', '_last_descendant', '_linkage_fixer', 
# '_most_recent_element', '_namespaces', '_popToTag', '_should_pretty_print', 
# 'append', 'attrs', 'builder', 'can_be_empty_element', 'cdata_list_attributes', 
# 'childGenerator', 'children', 'clear', 'contains_replacement_characters', 
# 'contents', 'currentTag', 'current_data', 'declared_html_encoding', 'decode', 
# 'decode_contents', 'decompose', 'descendants', 'encode', 'encode_contents', 
# 'endData', 'extend', 'extract', 'fetchNextSiblings', 'fetchParents', 
# 'fetchPrevious', 'fetchPreviousSiblings', 'find', 'findAll', 'findAllNext', 
# 'findAllPrevious', 'findChild', 'findChildren', 'findNext', 'findNextSibling', 
# 'findNextSiblings', 'findParent', 'findParents', 'findPrevious', 
# 'findPreviousSibling', 'findPreviousSiblings', 'find_all', 'find_all_next', 
# 'find_all_previous', 'find_next', 'find_next_sibling', 'find_next_siblings', 
# 'find_parent', 'find_parents', 'find_previous', 'find_previous_sibling', 
# 'find_previous_siblings', 'format_string', 'formatter_for_name', 'get', 'getText', 
# 'get_attribute_list', 'get_text', 'handle_data', 'handle_endtag', 'handle_starttag', 
# 'has_attr', 'has_key', 'hidden', 'index', 'insert', 'insert_after', 'insert_before', 
# 'isSelfClosing', 'is_empty_element', 'is_xml', 'known_xml', 'markup', 'name', 
# 'namespace', 'new_string', 'new_tag', 'next', 'nextGenerator', 'nextSibling', 
# 'nextSiblingGenerator', 'next_element', 'next_elements', 'next_sibling', 
# 'next_siblings', 'object_was_parsed', 'original_encoding', 'parent', 
# 'parentGenerator', 'parents', 'parse_only', 'parserClass', 'parser_class', 
# 'popTag', 'prefix', 'preserve_whitespace_tag_stack', 'preserve_whitespace_tags', 
# 'prettify', 'previous', 'previousGenerator', 'previousSibling', 
# 'previousSiblingGenerator', 'previous_element', 'previous_elements', 
# 'previous_sibling', 'previous_siblings', 'pushTag', 'recursiveChildGenerator', 
# 'renderContents', 'replaceWith', 'replaceWithChildren', 'replace_with', 
# 'replace_with_children', 'reset', 'select', 'select_one', 'setup', 'smooth', 
#'string', 'strings', 'stripped_strings', 'tagStack', 'text', 'unwrap', 'wrap']


# Try out some stuff...
############################
#
p_soup.h1			# → <h1 class="page-title-text">"graphics card"</h1>
p_soup.p   			# First p-tag
p_soup.meta 		# First meta tag
p_soup.body			# Simply the body of the page :)
p_soup.body.span	# First span-tag


##############################################################
# OK - Have a closer look at cs[10]
##############################################################
#
# cs = p_soup.findAll("div",{"class":"item-container"})

# type(cs)	# <class 'bs4.element.ResultSet'>
# len(cs)		# Number of elements = 40

# * Item cs[10] is a good example, as it has a price (not all items have prices)
#
# c=cs[10]
# type(c)   # bs4.element.Tag

# Fetch brand
############################################
#
# Like this:
#
# <div class="item-info"> » 
# <a 
#    class="item-brand" 
#    href="https://www.newegg.com/global/nl-en/GIGABYTE/BrandStore/ID-1314">
#    <img 
#       alt="GIGABYTE" 
#       src="//c1.neweggimages.com/Brandimage_70x28//Brand1314.gif" 
#       title="GIGABYTE"/>
# </a>
#
# c_brand = c.find(class_="item-brand").img['alt']
# print ("c10 - Brand: "+c_brand)

# # Product name
# ############################################
# #
# c_name = c.find(class_="item-title").text
# print("c10 - Name: "+c_name)

# # Fetch price
# ############################################
# #
# c_price = c.find(class_="price-current").strong.text
# print ("c10 - Price: "+c_price)


##############################################################
# Iterate over items in cs
##############################################################
#
# Create a resultset with all "item-container" div classes
##########################################################
#
# * This is actually plain HTML code
# * "div" has one argument-value-pair (or whatever its called). 
#   That's included here as a dictionary: {"argument":"value"}
#
print (">>> Create resultset cs...")

cs = p_soup.findAll("div",{"class":"item-container"})

type(cs)	# <class 'bs4.element.ResultSet'>
len(cs)		# Number of elements = 40
i=0

for c in cs:
	print("\n\n>>>>>>>>> Volgende element")
	i=i+1
	print(i)

	if (c.find(class_="item-brand") is not None):
		c_brand = c.find(class_="item-brand").img['alt']
		print ("Brand: "+c_brand)

	if (c.find(class_="item-title") is not None):
		c_name = c.find(class_="item-title").text
		print("Name: "+c_name)

	if (c.find(class_="price-current") is not None):
		c_price = c.find(class_="price-current").strong.text
		print ("Price: "+c_price)

NewEgg example - Compact

#! /usr/bin/python3
#
# Newegg webcrawling-example - Compact
###################################################################
#
print ("\n\n\n")
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print (">>> 107-Newegg-compact.py")
print (">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
print ("\n\n")


###################################################################
# Load libraries
###################################################################
#
from bs4 import BeautifulSoup
import requests


###################################################################
# Fetch webpage
###################################################################
#
url = 'https://www.newegg.com/global/nl-en/p/pl?d=graphics+card'
p_html = requests.get(url).text
p_soup = BeautifulSoup(p_html, "html.parser")


###################################################################
# Process webpage
###################################################################
#
cs = p_soup.findAll("div",{"class":"item-container"})

i=0
for c in cs:
	print("\n\n>>>>>>>>> Volgende element")
	i=i+1
	print(i)

	if (c.find(class_="item-brand") is not None):
		c_brand = c.find(class_="item-brand").img['alt']
		print ("Brand: "+c_brand)

	if (c.find(class_="item-title") is not None):
		c_name = c.find(class_="item-title").text
		print("Name: "+c_name)

	if (c.find(class_="price-current") is not None):
		c_price = c.find(class_="price-current").strong.text
		print ("Price: "+c_price)

See also

Sources

Post (instead of get)

Request-library