-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebScrapingInstructions.cs
More file actions
119 lines (100 loc) · 3.57 KB
/
webScrapingInstructions.cs
File metadata and controls
119 lines (100 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
using UnityEngine;
using UnityEngine.Networking;
using System.Collections;
using System.Collections.Generic;
using System.Text.RegularExpressions;
/// <summary>
/// Experimental scraper that downloads a web page and tries to pull boat-building
/// instruction text out of its HTML using regular expressions.
/// </summary>
/// <remarks>
/// Every successful parse appends to one shared list, so results from several
/// <see cref="AttemptScrape"/> runs accumulate and are returned together by
/// <see cref="GetScrapedInstructions"/>.
/// </remarks>
public class BoatInstructionScraper : MonoBehaviour
{
    // A "step" block is kept only when its stripped text is longer than this.
    private const int StepMinLengthExclusive = 30;

    // A generic list item is kept only when its stripped text length lies strictly between these bounds.
    private const int ListItemMinLengthExclusive = 20;
    private const int ListItemMaxLengthExclusive = 500;

    // Matches a <div> whose class attribute is exactly "step". The lazy capture stops at the
    // first closing </div>, so content after a nested <div> is not captured.
    private static readonly Regex WikiHowStepPattern = new Regex(
        @"<div[^>]*class=[""']step[""'][^>]*>(.*?)</div>",
        RegexOptions.Singleline);

    // Same shape as above, for a class attribute that is exactly "step-body".
    private static readonly Regex InstructablesStepPattern = new Regex(
        @"<div[^>]*class=[""']step-body[""'][^>]*>(.*?)</div>",
        RegexOptions.Singleline);

    // Matches attribute-less <li> elements only (e.g. <li class="x"> does not match).
    private static readonly Regex ListItemPattern = new Regex(
        @"<li>(.*?)</li>",
        RegexOptions.Singleline);

    // Used by StripHTML: any tag, and any run of whitespace.
    private static readonly Regex TagPattern = new Regex(@"<[^>]+>");
    private static readonly Regex WhitespacePattern = new Regex(@"\s+");

    // Candidate pages tried by TestAllURLs.
    private string[] attemptedURLs = new string[]
    {
        "https://www.wikihow.com/Build-a-Flat-Bottomed-Boat",
        "https://www.instructables.com/How-to-Build-a-Simple-Wooden-Boat/",
        "https://www.popularmechanics.com/home/how-to/a6244/how-to-build-a-boat/",
        "https://www.diynetwork.com/how-to/skills-and-know-how/carpentry-and-woodworking/how-to-build-a-wooden-boat"
    };

    // Accumulates the text extracted by every scrape attempt.
    private List<string> scrapedInstructions = new List<string>();

    /// <summary>
    /// Coroutine that downloads <paramref name="url"/> and runs all three parsers over the
    /// response body, appending whatever they extract to the shared instruction list.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>An enumerator intended to be run with <c>StartCoroutine</c>.</returns>
    public IEnumerator AttemptScrape(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            Debug.LogError("Scrape failed: no URL was provided.");
            yield break;
        }

        Debug.LogWarning("Attempting web scrape (this probably won't work)...");

        // The request is disposable; the using block releases it on every exit path,
        // including the early yield break below.
        using (UnityWebRequest request = UnityWebRequest.Get(url))
        {
            request.SetRequestHeader("User-Agent", "Mozilla/5.0");
            yield return request.SendWebRequest();

            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError($"Scrape failed: {request.error}");
                Debug.LogError("Reason: Quest doesn't handle web requests well, sites block bots, CORS issues");
                yield break;
            }

            // Guard against a missing body so the regex calls below never receive null.
            string html = request.downloadHandler.text ?? string.Empty;

            // The list is shared between scrapes, so success for THIS page is measured by
            // growth rather than by the list being non-empty. No yield occurs between this
            // snapshot and the comparison, so only this page's parse can change the count.
            int countBeforeParse = scrapedInstructions.Count;

            TryParseWikiHow(html);
            TryParseInstructables(html);
            TryParseGenericHTML(html);

            if (scrapedInstructions.Count == countBeforeParse)
            {
                Debug.LogError("Parsing failed. HTML structure didn't match expected patterns.");
            }
        }
    }

    /// <summary>Collects the text of every <c>class="step"</c> div longer than 30 characters.</summary>
    /// <param name="html">Raw page markup.</param>
    private void TryParseWikiHow(string html)
    {
        CollectMatches(WikiHowStepPattern, html, StepMinLengthExclusive, int.MaxValue);
    }

    /// <summary>Collects the text of every <c>class="step-body"</c> div longer than 30 characters.</summary>
    /// <param name="html">Raw page markup.</param>
    private void TryParseInstructables(string html)
    {
        CollectMatches(InstructablesStepPattern, html, StepMinLengthExclusive, int.MaxValue);
    }

    /// <summary>Collects the text of every plain list item whose length is between 21 and 499 characters.</summary>
    /// <param name="html">Raw page markup.</param>
    private void TryParseGenericHTML(string html)
    {
        CollectMatches(ListItemPattern, html, ListItemMinLengthExclusive, ListItemMaxLengthExclusive);
    }

    /// <summary>
    /// Shared extraction loop: strips markup from capture group 1 of every match and keeps
    /// the result when its length lies strictly between the two bounds.
    /// </summary>
    /// <param name="pattern">Expression whose first capture group holds the element content.</param>
    /// <param name="html">Raw page markup; null or empty yields no matches.</param>
    /// <param name="minLengthExclusive">Text must be longer than this to be kept.</param>
    /// <param name="maxLengthExclusive">Text must be shorter than this to be kept.</param>
    private void CollectMatches(Regex pattern, string html, int minLengthExclusive, int maxLengthExclusive)
    {
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        foreach (Match match in pattern.Matches(html))
        {
            string text = StripHTML(match.Groups[1].Value);
            if (text.Length > minLengthExclusive && text.Length < maxLengthExclusive)
            {
                scrapedInstructions.Add(text);
            }
        }
    }

    /// <summary>
    /// Converts an HTML fragment to plain text: tags become spaces, character entities are
    /// decoded, whitespace runs collapse to a single space, and the ends are trimmed.
    /// </summary>
    /// <param name="html">HTML fragment to clean.</param>
    /// <returns>The plain-text content of the fragment.</returns>
    private string StripHTML(string html)
    {
        string text = TagPattern.Replace(html, " ");
        text = System.Net.WebUtility.HtmlDecode(text);
        text = WhitespacePattern.Replace(text, " ");
        return text.Trim();
    }

    /// <summary>
    /// Returns a copy of everything scraped so far.
    /// </summary>
    /// <returns>
    /// The accumulated instruction strings. When nothing was scraped a warning is logged and
    /// an empty array is returned; this method does not itself supply fallback content.
    /// </returns>
    public string[] GetScrapedInstructions()
    {
        if (scrapedInstructions.Count == 0)
        {
            Debug.LogWarning("Scraping didn't work. Using hardcoded fallback.");
        }

        return scrapedInstructions.ToArray();
    }

    /// <summary>
    /// Starts one scrape coroutine per candidate URL. Nothing in this class calls it.
    /// </summary>
    private void TestAllURLs()
    {
        foreach (string url in attemptedURLs)
        {
            StartCoroutine(AttemptScrape(url));
        }
    }
}
// We tried :(