I am trying to figure out how I can scrape the contents of a JavaScript `<script>` tag using regex, which I believe might be the easiest way.
The tag looks like:
JavaScript
x
152
152
1
<script type="text/javascript">
2
3
var spConfig=newApex.Config({
4
"attributes": {
5
"199": {
6
"id": "199",
7
"code": "legend",
8
"label": "Weapons",
9
"options": [
10
{
11
"label": "10",
12
"priceInGame": "0",
13
"id": [
14
15
]
16
},
17
{
18
"label": "10.5",
19
"priceInGame": "0",
20
"id": [
21
22
]
23
},
24
{
25
"label": "11",
26
"priceInGame": "0",
27
"id": [
28
"66659"
29
]
30
},
31
{
32
"label": "11.5",
33
"priceInGame": "0",
34
"id": [
35
]
36
},
37
{
38
"label": "12",
39
"priceInGame": "0",
40
"id": [
41
42
]
43
},
44
{
45
"label": "12.5",
46
"priceInGame": "0",
47
"id": [
48
]
49
},
50
{
51
"label": "13",
52
"priceInGame": "0",
53
"id": [
54
55
]
56
},
57
{
58
"label": "4",
59
"priceInGame": "0",
60
"id": [
61
62
]
63
},
64
{
65
"label": "4.5",
66
"priceInGame": "0",
67
"id": [
68
69
]
70
},
71
{
72
"label": "5",
73
"priceInGame": "0",
74
"id": [
75
76
]
77
},
78
{
79
"label": "5.5",
80
"priceInGame": "0",
81
"id": [
82
83
]
84
},
85
{
86
"label": "6",
87
"priceInGame": "0",
88
"id": [
89
90
]
91
},
92
{
93
"label": "6.5",
94
"priceInGame": "0",
95
"id": [
96
97
]
98
},
99
{
100
"label": "7",
101
"priceInGame": "0",
102
"id": [
103
104
]
105
},
106
{
107
"label": "7.5",
108
"priceInGame": "0",
109
"id": [
110
111
]
112
},
113
{
114
"label": "8",
115
"priceInGame": "0",
116
"id": [
117
"66672"
118
]
119
},
120
{
121
"label": "8.5",
122
"priceInGame": "0",
123
"id": [
124
"66673"
125
]
126
},
127
{
128
"label": "9",
129
"priceInGame": "0",
130
"id": [
131
132
]
133
},
134
{
135
"label": "9.5",
136
"priceInGame": "0",
137
"id": [
138
"66675"
139
]
140
}
141
]
142
}
143
},
144
"weaponID": "66733",
145
"chooseText": "Apex Legends",
146
"Config": {
147
"includeCoins": false,
148
}
149
});
150
151
</script>
152
and I want to scrape every "label" value from the "options" list.
What I tried to do is:
JavaScript
1
7
1
for nosto_sku_tag in bs4.find_all('script', {'type': 'text/javascript'}):
2
try:
3
test = re.findall('var spConfig = ({.*}?);', nosto_sku_tag.text.strip())
4
print(test)
5
except: # noqa
6
continue
7
but it only returned an empty list: []
So my question is: what can I do to scrape the labels?
Advertisement
Answer
You need to specify the attribute using either the `attr=value` keyword syntax
or the `attrs={'attr': 'value'}` dictionary
syntax.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#the-keyword-arguments
JavaScript
1
45
45
1
import json
2
import re
3
4
from bs4 import BeautifulSoup
5
6
def extract_option_labels(script_text, attribute_id='199'):
    """Return the option labels from a ``var spConfig = new Apex.Config({...});`` script.

    Args:
        script_text: Text content of the ``<script>`` tag.
        attribute_id: Key under ``"attributes"`` whose ``"options"`` are read.

    Returns:
        A list with the ``"label"`` value of every option, in document order.

    Raises:
        ValueError: If no ``spConfig`` definition is found in ``script_text``.
        json.JSONDecodeError: If the captured object literal is not valid JSON
            even after trailing commas have been removed.
        KeyError: If ``attribute_id`` (or the expected structure) is missing.
    """
    # The parentheses of the constructor call must be escaped, otherwise they
    # are regex groups and the literal "(" in the script never matches.
    # re.DOTALL lets ".*" span the line breaks inside the object literal, so
    # the text does not have to be flattened first.
    match = re.search(
        r'var\s+spConfig\s*=\s*new\s*Apex\.Config\((\{.*\})\);',
        script_text,
        flags=re.DOTALL,
    )
    if match is None:
        raise ValueError('spConfig definition not found in script text')

    # JavaScript object literals allow a trailing comma before "}" or "]";
    # strict JSON does not, so drop them before parsing.
    # NOTE: this would also alter a string value that contains ",}" or ",]".
    raw_json = re.sub(r',(\s*[}\]])', r'\1', match.group(1))

    data = json.loads(raw_json)
    return [opt['label'] for opt in data['attributes'][attribute_id]['options']]


if __name__ == '__main__':
    html = '''
    <script type="text/javascript">

    var spConfig=newApex.Config({
        "attributes": {
            "199": {
                "id": "199",
                "code": "legend",
                "label": "Weapons",
                "options": [
                    { "label": "10", "priceInGame": "0", "id": [] },
                    { "label": "10.5", "priceInGame": "0", "id": [] },
                    { "label": "11", "priceInGame": "0", "id": [ "66659" ] },
                    { "label": "7.5", "priceInGame": "0", "id": [] },
                    { "label": "8", "priceInGame": "0", "id": ["66672"] }
                ]
            }
        },
        "weaponID": "66733",
        "chooseText": "Apex Legends",
        "taxConfig": {
            "includeCoins": false,
        }
    });

    </script>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    # this one works too:
    # script = soup.find('script', attrs={'type': 'text/javascript'})
    script = soup.find('script', type='text/javascript')
    if script is None:
        raise SystemExit('no <script type="text/javascript"> tag found')

    labels = extract_option_labels(script.text)
    print(labels)
44
45
output:
JavaScript
1
2
1
['10', '10.5', '11', '7.5', '8'] some removed for brevity
2