I am trying to figure out how I can scrape the contents of a JavaScript `<script>` tag. I believe using a regex might be the easiest way.
The tag looks like:
<script type="text/javascript"> var spConfig=newApex.Config({ "attributes": { "199": { "id": "199", "code": "legend", "label": "Weapons", "options": [ { "label": "10", "priceInGame": "0", "id": [ ] }, { "label": "10.5", "priceInGame": "0", "id": [ ] }, { "label": "11", "priceInGame": "0", "id": [ "66659" ] }, { "label": "11.5", "priceInGame": "0", "id": [ ] }, { "label": "12", "priceInGame": "0", "id": [ ] }, { "label": "12.5", "priceInGame": "0", "id": [ ] }, { "label": "13", "priceInGame": "0", "id": [ ] }, { "label": "4", "priceInGame": "0", "id": [ ] }, { "label": "4.5", "priceInGame": "0", "id": [ ] }, { "label": "5", "priceInGame": "0", "id": [ ] }, { "label": "5.5", "priceInGame": "0", "id": [ ] }, { "label": "6", "priceInGame": "0", "id": [ ] }, { "label": "6.5", "priceInGame": "0", "id": [ ] }, { "label": "7", "priceInGame": "0", "id": [ ] }, { "label": "7.5", "priceInGame": "0", "id": [ ] }, { "label": "8", "priceInGame": "0", "id": [ "66672" ] }, { "label": "8.5", "priceInGame": "0", "id": [ "66673" ] }, { "label": "9", "priceInGame": "0", "id": [ ] }, { "label": "9.5", "priceInGame": "0", "id": [ "66675" ] } ] } }, "weaponID": "66733", "chooseText": "Apex Legends", "Config": { "includeCoins": false, } }); </script>
and I want to scrape all of the "label" values from the "options" list.
What I tried to do is:
# Walk every <script type="text/javascript"> tag and print whatever the
# spConfig pattern captures from its stripped text.
sp_config_pattern = 'var spConfig = ({.*}?);'
for script_tag in bs4.find_all('script', {'type': 'text/javascript'}):
    try:
        script_body = script_tag.text.strip()
        matches = re.findall(sp_config_pattern, script_body)
        print(matches)
    except:  # noqa
        # Any failure on this tag is ignored and the loop moves on.
        continue
but it only returned an empty list, [].
So my question is: what can I do to scrape the labels?
Advertisement
Answer
You need to specify the attribute using either the `attr=value` or the `attrs={'attr': 'value'}` syntax.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#the-keyword-arguments
import json
import re

# Sample page fragment. Note the trailing comma after ``false``: it is legal
# in a JavaScript object literal but not in JSON.
HTML = '''
<script type="text/javascript">
var spConfig=newApex.Config({
    "attributes": {
        "199": {
            "id": "199",
            "code": "legend",
            "label": "Weapons",
            "options": [
                { "label": "10", "priceInGame": "0", "id": [] },
                { "label": "10.5", "priceInGame": "0", "id": [] },
                { "label": "11", "priceInGame": "0", "id": [ "66659" ] },
                { "label": "7.5", "priceInGame": "0", "id": [] },
                { "label": "8", "priceInGame": "0", "id": ["66672"] }
            ]
        }
    },
    "weaponID": "66733",
    "chooseText": "Apex Legends",
    "taxConfig": {
        "includeCoins": false,
    }
});
</script>
'''

# Captures the object literal passed to ``newApex.Config(...)``.
# The call's own parentheses are escaped so they match literally, and DOTALL
# lets ``.*`` run across the line breaks inside the script body (MULTILINE
# only affects ``^``/``$`` and would not help here).
SP_CONFIG_RE = re.compile(
    r'var\s+spConfig\s*=\s*newApex\.Config\((\{.*\})\);',
    re.DOTALL,
)

# A comma that directly precedes a closing brace/bracket.
# NOTE: this is a textual clean-up, so a string value containing ",}" or ",]"
# would be altered too; acceptable for this payload, not a general JS parser.
TRAILING_COMMA_RE = re.compile(r',(\s*[}\]])')


def extract_sp_config(js):
    """Return the ``spConfig`` object found in *js* as a Python dict.

    Args:
        js: Text of the ``<script>`` element (or any text containing the
            ``var spConfig=newApex.Config({...});`` statement).

    Returns:
        The decoded configuration dictionary.

    Raises:
        ValueError: If the ``spConfig`` statement is not present, or if the
            captured literal is still not valid JSON after trailing commas
            are removed (``json.JSONDecodeError`` is a ``ValueError``).
    """
    match = SP_CONFIG_RE.search(js)
    if match is None:
        raise ValueError('spConfig assignment not found in script text')
    # json.loads is strict, so drop JavaScript-style trailing commas first.
    raw_json = TRAILING_COMMA_RE.sub(r'\1', match.group(1))
    return json.loads(raw_json)


def extract_labels(js, attribute_id='199'):
    """Return the ``label`` of every option of one attribute in ``spConfig``.

    Args:
        js: Script text containing the ``spConfig`` statement.
        attribute_id: Key under ``"attributes"`` whose options are read.

    Returns:
        A list of label strings, in document order.

    Raises:
        ValueError: Propagated from :func:`extract_sp_config`.
        KeyError: If the attribute, its ``options`` or a ``label`` is missing.
    """
    config = extract_sp_config(js)
    options = config['attributes'][attribute_id]['options']
    return [opt['label'] for opt in options]


def main():
    """Parse the sample HTML with BeautifulSoup and print the option labels."""
    # Imported here so the parsing helpers above stay importable without bs4.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(HTML, 'html.parser')
    # this one works too
    # script = soup.find('script', attrs={'type': 'text/javascript'})
    script = soup.find('script', type='text/javascript')
    if script is None:
        raise ValueError('no <script type="text/javascript"> tag found')
    print(extract_labels(script.text))


if __name__ == '__main__':
    main()
output:
['10', '10.5', '11', '7.5', '8'] ... some removed for brevity