Skip to content

How to get the raw HTML of a page #118

@tamoyal

Description

@tamoyal

What version of go-chrome are you using (tag, hash, etc.)?

32dfd32

Issue

I understand how to get the DOM as an object, but a similar approach for getting the raw HTML is not working. This is the code:

// Reproduction snippet: launches headless Chrome, opens a tab, waits for the
// page load event, requests the outer HTML of node ID 1 and prints the result
// as indented JSON.

// err is declared up front; the NewTab call below assigns to it.
var err error

	// Linux alternative:
	// chrome_path := "/usr/bin/google-chrome"
	chrome_path := "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

	// Define a chrome instance with remote debugging enabled.
	browser := chrome.New(
		// See https://developers.google.com/web/updates/2017/04/headless-chrome#cli
		// for details about startup flags
		&chrome.Flags{
			"addr":               "localhost",
			"disable-extensions": nil,
			"disable-gpu":        nil,
			"headless":           true,
			"hide-scrollbars":    nil,
			"no-first-run":       nil,
			"no-sandbox":         nil,
			"port":               9222,
			"remote-debugging-address": "0.0.0.0",
			"remote-debugging-port":    9222,
		},
		chrome_path, // Path to the Chromium binary
		"/tmp",      // Set the Chromium working directory
		"/dev/null", // Ignore internal Chromium output, set to empty string for os.Stdout
		"/dev/null", // Ignore internal Chromium errors, set to empty string for os.Stderr
	)

	// Start the chrome process.
	if err := browser.Launch(); nil != err {
		panic(err)
	}

	// Open a tab and navigate to the page whose HTML should be fetched.
	tab, err := browser.NewTab("http://www.brainjar.com/java/host/test.html")
	if nil != err {
		panic(err)
	}

	// Enable Page events for this tab.
	if enableResult := <-tab.Page().Enable(); nil != enableResult.Err {
		panic(enableResult.Err)
	}

	// Enable the DOM agent for this tab.
	if enableResult := <-tab.DOM().Enable(); nil != enableResult.Err {
		panic(enableResult.Err)
	}

	// Create a channel to receive the GetOuterHTML result.
	outer_html_chan := make(chan *dom.GetOuterHTMLResult)

	// When the page load event fires, request the outer HTML of node ID 1 and
	// forward the result to the channel.
	tab.Page().OnLoadEventFired(func(event *page.LoadEventFiredEvent) {
		// NOTE(review): the node ID is hard-coded to 1 and no document is
		// requested beforehand. Presumably the DevTools protocol only
		// recognizes node IDs it has already sent to the client (e.g. via
		// DOM.getDocument), which may explain the empty outerHTML — confirm.
		params := &dom.GetOuterHTMLParams{
			NodeID:        dom.NodeID(1),
			// Alternative ways to identify the node (currently unused):
			// BackendNodeID: dom.BackendNodeID(1),
			// ObjectID:      runtime.RemoteObjectID("remote-object-id"),
		}
		outer_html_chan <- <-tab.DOM().GetOuterHTML(params)
	})

	// Block until the load-event handler delivers a result.
	result := <-outer_html_chan
	// The marshalling error is discarded here.
	// NOTE(review): the result is printed without checking it for an error;
	// assuming GetOuterHTMLResult carries an Err field like the Enable
	// results above, inspecting it may reveal why outerHTML is empty — confirm.
	tmp, _ := json.MarshalIndent(result, "", "    ")
	fmt.Printf("%s\n\n", string(tmp))

Which prints:

{
    "outerHTML": ""
}

The Google docs aren't helping me much in understanding what's going on behind the scenes, so any help would be appreciated. Thanks!

Metadata

Metadata

Assignees

Labels

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions