// Meta-AI-Hackathon/Scripts/webScrapingInstructions.cs (branch: main, repository: rishyendra333/Meta-AI-Hackathon, GitHub)

using UnityEngine;
using UnityEngine.Networking;
using System.Collections;
using System.Collections.Generic;
using System.Text.RegularExpressions;

/// <summary>
/// Downloads boat-building tutorial pages and tries to extract step text from
/// the raw HTML with regular expressions. Every extracted snippet is appended
/// to a single shared list that <see cref="GetScrapedInstructions"/> exposes.
/// </summary>
public class BoatInstructionScraper : MonoBehaviour
{
    // Stripped step text must be strictly longer than this to be kept.
    private const int MinStepLength = 30;

    // Stripped <li> text must be strictly between these two lengths to be kept.
    private const int MinListItemLength = 20;
    private const int MaxListItemLength = 500;

    // Patterns are built once and reused instead of being re-parsed on every call.
    private static readonly Regex WikiHowStepPattern = new Regex(
        @"<div[^>]*class=[""']step[""'][^>]*>(.*?)</div>",
        RegexOptions.Singleline);

    private static readonly Regex InstructablesStepPattern = new Regex(
        @"<div[^>]*class=[""']step-body[""'][^>]*>(.*?)</div>",
        RegexOptions.Singleline);

    private static readonly Regex ListItemPattern = new Regex(
        @"<li>(.*?)</li>",
        RegexOptions.Singleline);

    private static readonly Regex TagPattern = new Regex(@"<[^>]+>");
    private static readonly Regex WhitespacePattern = new Regex(@"\s+");

    // URLs that TestAllURLs iterates over.
    private readonly string[] attemptedURLs = new string[]
    {
        "https://www.wikihow.com/Build-a-Flat-Bottomed-Boat",
        "https://www.instructables.com/How-to-Build-a-Simple-Wooden-Boat/",
        "https://www.popularmechanics.com/home/how-to/a6244/how-to-build-a-boat/",
        "https://www.diynetwork.com/how-to/skills-and-know-how/carpentry-and-woodworking/how-to-build-a-wooden-boat"
    };

    // Accumulates results from every scrape started on this component.
    private readonly List<string> scrapedInstructions = new List<string>();

    /// <summary>
    /// Coroutine that fetches <paramref name="url"/> and runs all three parsers
    /// over the response body, appending any matches to the shared result list.
    /// </summary>
    /// <param name="url">Absolute URL of the page to download.</param>
    /// <returns>An enumerator suitable for <c>StartCoroutine</c>.</returns>
    public IEnumerator AttemptScrape(string url)
    {
        Debug.LogWarning("Attempting web scrape (this probably won't work)...");

        if (string.IsNullOrEmpty(url))
        {
            Debug.LogError("Scrape failed: no URL was provided.");
            yield break;
        }

        // The request is disposable; the using block releases it on every
        // exit path, including the early yield break below.
        using (UnityWebRequest request = UnityWebRequest.Get(url))
        {
            request.SetRequestHeader("User-Agent", "Mozilla/5.0");

            yield return request.SendWebRequest();

            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError($"Scrape failed: {request.error}");
                Debug.LogError("Reason: Quest doesn't handle web requests well, sites block bots, CORS issues");
                yield break;
            }

            // Regex matching throws on null input, so normalise to an empty string.
            string html = request.downloadHandler.text ?? string.Empty;

            // Compare against the count before parsing so a failure on this
            // page is reported even when an earlier scrape already added items.
            int countBeforeParse = scrapedInstructions.Count;

            TryParseWikiHow(html);
            TryParseInstructables(html);
            TryParseGenericHTML(html);

            if (scrapedInstructions.Count == countBeforeParse)
            {
                Debug.LogError("Parsing failed. HTML structure didn't match expected patterns.");
            }
        }
    }

    /// <summary>
    /// Collects the contents of <c>div</c> elements whose class attribute is exactly "step".
    /// </summary>
    void TryParseWikiHow(string html)
    {
        AddMatches(WikiHowStepPattern, html, MinStepLength, int.MaxValue);
    }

    /// <summary>
    /// Collects the contents of <c>div</c> elements whose class attribute is exactly "step-body".
    /// </summary>
    void TryParseInstructables(string html)
    {
        AddMatches(InstructablesStepPattern, html, MinStepLength, int.MaxValue);
    }

    /// <summary>
    /// Collects the contents of attribute-less <c>&lt;li&gt;</c> elements of moderate length.
    /// </summary>
    void TryParseGenericHTML(string html)
    {
        AddMatches(ListItemPattern, html, MinListItemLength, MaxListItemLength);
    }

    /// <summary>
    /// Runs <paramref name="pattern"/> over <paramref name="html"/>, strips markup from
    /// capture group 1 of each match, and stores the text when its length lies strictly
    /// between <paramref name="minLengthExclusive"/> and <paramref name="maxLengthExclusive"/>.
    /// </summary>
    private void AddMatches(Regex pattern, string html, int minLengthExclusive, int maxLengthExclusive)
    {
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        foreach (Match match in pattern.Matches(html))
        {
            string text = StripHTML(match.Groups[1].Value);
            if (text.Length > minLengthExclusive && text.Length < maxLengthExclusive)
            {
                scrapedInstructions.Add(text);
            }
        }
    }

    /// <summary>
    /// Replaces tags with spaces, decodes HTML entities, collapses whitespace runs
    /// to a single space, and trims the result.
    /// </summary>
    string StripHTML(string html)
    {
        string text = TagPattern.Replace(html, " ");
        text = System.Net.WebUtility.HtmlDecode(text);
        text = WhitespacePattern.Replace(text, " ");
        return text.Trim();
    }

    /// <summary>
    /// Returns a copy of everything scraped so far.
    /// </summary>
    /// <returns>
    /// The accumulated instructions; an empty array when nothing was scraped.
    /// No fallback data is substituted here despite the warning text, so the
    /// caller has to supply its own.
    /// </returns>
    public string[] GetScrapedInstructions()
    {
        if (scrapedInstructions.Count == 0)
        {
            Debug.LogWarning("Scraping didn't work. Using hardcoded fallback.");
        }
        return scrapedInstructions.ToArray();
    }

    /// <summary>
    /// Starts one scrape coroutine per URL in <c>attemptedURLs</c>; all of them
    /// append to the same result list.
    /// </summary>
    void TestAllURLs()
    {
        foreach (string url in attemptedURLs)
        {
            StartCoroutine(AttemptScrape(url));
        }
    }
}

// We tried :(