render-html.com: Web scraping and unblocking service

Make a demo request

How to use

Endpoints

The endpoint to scrape data is https://render-html.com/render. It expects request parameters in the query string and some of the request headers are forwarded to the scraped URL. An endpoint to test the integration is https://render-html.com/test.

Example usage

The following examples are command-line applications that require the environment variable RENDER_HTML_API_KEY to be set.

Shell
Python/Scrapy
PHP
Node.js/Crawlee

#!/usr/bin/env sh
# Request a screenshot of the test page through the render-html.com API,
# store the response body in ./curl_output and open it.
# Requires the environment variable RENDER_HTML_API_KEY.

url=https://render-html.com/test?redirect

options="\
&render_mode=screenshot\
&format=content\
&viewport_width=800\
&viewport_height=800\
"

# url_raw must be the last parameter: everything after it belongs to the target URL.
api_url="https://render-html.com/render?$options&url_raw=$url"

# The URL is quoted so the shell neither word-splits it nor treats '?' as a
# glob pattern. On failure the response body saved by --fail-with-body is
# shown and the script exits with a non-zero status.
curl --fail-with-body --no-progress-meter "$api_url" \
    -H "authorization:Bearer $RENDER_HTML_API_KEY" \
    > curl_output || { cat curl_output >&2; exit 1; }
xdg-open curl_output

#!/usr/bin/env python

import re
import os
import scrapy
from dataclasses import dataclass
from scrapy.crawler import CrawlerProcess


@dataclass
class RenderHtmlResponse:
    status: bytes
    url: bytes
    original_url: None|bytes = None
    content_type: None|bytes = None
    raw_data: None|bytes = None
    browser_html: None|bytes = None
    screenshot: None|bytes = None

    @staticmethod
    def deserialize(body: bytes):
        return RenderHtmlResponse(**{
            key.decode(): value.replace(b'\n ', b'\n')
            for key, value in re.findall(rb'(.*?)(?:=|\n )([\s\S]*?)(?:\n(?! )|$)', body)
        })


class RenderHtmlMiddleware:
    """Scrapy downloader middleware that sends requests through the
    render-html.com API and unwraps the API responses.

    A request can carry a query-string fragment in
    ``request.meta['render_html_options']``; setting that entry to ``False``
    leaves the request untouched. The API key is read from the
    ``RENDER_HTML_API_KEY`` setting and must be ``bytes``.
    """
    API_BASE_URL = 'https://render-html.com/render?'

    def process_request(self, request, spider):
        """Return a copy of ``request`` that targets the API, or ``None``
        when the request must be downloaded as it is."""
        options = request.meta.get('render_html_options', '')
        if options is False:
            return None
        # The rewritten request passes through this method again; it is
        # recognised by its URL and let through.
        if request.url.startswith(self.API_BASE_URL):
            return None
        api_key = spider.settings['RENDER_HTML_API_KEY']
        return request.replace(
            # url_raw goes last: the target URL is appended without encoding.
            url=self.API_BASE_URL + f'{options}&url_raw={request.url}',
            headers={**request.headers, 'authorization': b'Bearer ' + api_key}
        )

    def process_response(self, response, request, spider):
        """Replace an API response by the response it describes.

        Responses of requests that did not go through the API are returned
        unchanged. The parsed payload is attached to the returned response
        as ``render_html_response``.

        Raises:
            Exception: carrying the response body when the API itself did
                not answer with status 200.
        """
        if not request.url.startswith(self.API_BASE_URL):
            # A downloader middleware has to hand back a response here;
            # returning None is rejected by Scrapy.
            return response
        if response.status != 200:
            raise Exception(response.body)
        rhr = RenderHtmlResponse.deserialize(response.body)
        response.headers.update({'content-type': rhr.content_type})
        response = response.replace(
            url=rhr.url.decode(),
            body=rhr.raw_data,
            status=rhr.status,
            request=request.replace(url=rhr.original_url.decode())
        )
        response.render_html_response = rhr
        return response


class ExampleSpider(scrapy.Spider):
    """Demo spider: fetches the test endpoint through RenderHtmlMiddleware
    and prints what comes back."""

    name = 'ExamplesSpider'

    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={RenderHtmlMiddleware: 0},
    )

    def start_requests(self):
        """Yield the single request this example performs."""
        target = 'https://render-html.com/test?redirect'
        yield scrapy.Request(target)

    def parse(self, response):
        """Print the request, the response, its headers and its decoded body."""
        originating_request = response.request
        print(originating_request)
        print(response)
        header_map = response.headers
        print(header_map)
        body_text = response.body.decode()
        print(body_text)


def main():
    """Run ExampleSpider with the API key taken from the environment
    variable RENDER_HTML_API_KEY."""
    settings = {
        'LOG_LEVEL': 'ERROR',
        # Kept as bytes: the middleware appends it to b'Bearer '.
        'RENDER_HTML_API_KEY': os.environ['RENDER_HTML_API_KEY'].encode(),
    }
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(ExampleSpider)
    crawler_process.start()


if __name__ == '__main__':
    main()

#!/usr/bin/env php
<?php declare(strict_types=1);

/** Thrown when the render-html.com API answers with an HTTP status other than 200. */
class RenderHtmlException extends Exception {}

/**
 * Parsed form of an API response body in the key_value format.
 *
 * Each public property holds the value of the key with the same name, or
 * null when the body did not contain that key.
 */
class RenderHtmlResponse
{
    /**
     * The variadic $args collects named arguments that match no property,
     * so keys without a matching property are accepted and dropped.
     */
    function __construct(
        public ?string $original_url = null,
        public ?string $url = null,
        public ?string $status = null,
        public ?string $content_type = null,
        public ?string $raw_data = null,
        public ?string $browser_html = null,
        public ?string $screenshot = null,
        ...$args
    ) {}

    /**
     * Builds a response object from a key_value body.
     *
     * Inside a value every newline is followed by an extra space; that
     * space is removed here.
     */
    static function deserialize(string $body): RenderHtmlResponse
    {
        preg_match_all('/(.*?)(?:=|\n )([\s\S]*?)(?:\n(?! )|$)/', $body, $groups);
        [, $keys, $escaped_values] = $groups;

        $values = [];
        foreach ($escaped_values as $escaped_value) {
            $values[] = str_replace("\n ", "\n", $escaped_value);
        }

        // Spread the key => value map as named constructor arguments.
        $named_arguments = array_combine($keys, $values);
        return new RenderHtmlResponse(...$named_arguments);
    }
}

/**
 * Fetches $url through the render-html.com API and returns the parsed response.
 *
 * @param string $api_key API key sent as bearer token.
 * @param string $url     Target URL; appended unencoded as url_raw.
 * @param string $options Further query-string parameters, already encoded.
 *
 * @throws Exception           when the transfer itself fails.
 * @throws RenderHtmlException when the API answers with a status other than 200.
 */
function make_render_html_request(string $api_key, string $url,
    string $options=''): RenderHtmlResponse
{
    $api_url = 'https://render-html.com/render?' . $options . '&url_raw=' . $url;
    $handle = curl_init();
    try {
        $settings = [
            CURLOPT_URL => $api_url,
            // NOTE(review): the empty 'accept:' entry presumably suppresses
            // curl's default Accept header - confirm.
            CURLOPT_HTTPHEADER => ['authorization:Bearer ' . $api_key, 'accept:'],
            CURLOPT_RETURNTRANSFER => true,
        ];
        foreach ($settings as $option => $setting) {
            curl_setopt($handle, $option, $setting);
        }

        $body = curl_exec($handle);
        if ($body === false) {
            throw new Exception(curl_error($handle));
        }
        if (curl_getinfo($handle, CURLINFO_HTTP_CODE) !== 200) {
            throw new RenderHtmlException($body);
        }
        return RenderHtmlResponse::deserialize($body);
    }
    finally {
        curl_close($handle);
    }
}

/**
 * Requests http://example.org through the API and prints every field of the
 * response. Requires the environment variable RENDER_HTML_API_KEY.
 */
function main()
{
    $render_html_api_key = getenv('RENDER_HTML_API_KEY');
    if ($render_html_api_key === false) {
        // getenv() yields false for an unset variable, which strict_types
        // would turn into a TypeError further down.
        print("The environment variable RENDER_HTML_API_KEY is not set.\n");
        return;
    }
    try {
        $response = make_render_html_request(
            $render_html_api_key,
            'http://example.org',
            'render_mode=raw_data,screenshot&viewport_width=200&viewport_height=800'
        );
    }
    catch (RenderHtmlException $e) {
        // print instead of printf: the message comes from the server and
        // must not be interpreted as a format string.
        print("Failed to fetch the response: " . $e->getMessage() . "\n");
        return;
    }
    foreach ($response as $key => $value) {
        $value = $value ?? '(null)';
        print("\e[1m$key:\e[0m\n$value\n\n");
    }
}

main();

#!/usr/bin/env node

const crawlee = require('crawlee');

/**
 * BasicCrawler that redirects every request to the render-html.com API
 * (screenshot render mode, viewport width 500) before the request handler runs.
 */
class RenderHtmlCrawler extends crawlee.BasicCrawler
{
    /**
     * @param {object} options
     * @param {string} options.render_html_api_key API key sent as bearer token.
     * @param {Function} options.requestHandler Handler passed on to BasicCrawler.
     */
    constructor({render_html_api_key, requestHandler})
    {
        super({requestHandler});
        this.render_html_api_key = render_html_api_key;
    }

    /**
     * Points the request at the API, adds the authorization header and then
     * runs the regular request handler.
     */
    async _runRequestHandler(ctx)
    {
        const apiBaseUrl = 'https://render-html.com/render?';
        ctx.request.headers['user-agent'] = 'Crawlee';
        // Rewrite only once: a request object that comes through here again
        // (e.g. on a retry) already carries the API URL and must not be
        // wrapped a second time.
        if (!ctx.request.url.startsWith(apiBaseUrl)) {
            ctx.request.url = apiBaseUrl +
                `render_mode=screenshot&viewport_width=500&url_raw=${ctx.request.url}`;
        }
        ctx.request.headers['authorization'] = 'Bearer ' + this.render_html_api_key;
        await super._runRequestHandler(ctx);
    }
}

/**
 * Sends the request, parses the key_value response body and writes the
 * screenshot it contains to ./screenshot.png.
 *
 * @throws {Error} when the status is not 200 or the body has no screenshot.
 */
async function requestHandler({request, sendRequest})
{
    const response = await sendRequest();
    if (response.statusCode !== 200) {
        // Throw a real Error rather than a bare string so that a message
        // and a stack trace are available to whoever handles it.
        throw new Error(String(response.body));
    }
    // The 'binary' encoding maps every byte to one character, so the
    // screenshot bytes survive the round trip through a string.
    const entries = response.rawBody.toString('binary')
        .matchAll(/(.*?)(?:=|\n )([\s\S]*?)(?:\n(?! )|$)/g);
    // Array.from instead of calling .map() on the iterator, which only
    // works on runtimes that provide iterator helpers.
    const data = Object.fromEntries(
        Array.from(entries, x => [x[1], x[2].replaceAll('\n ', '\n')])
    );
    if (data['screenshot'] === undefined) {
        throw new Error('The response does not contain a screenshot');
    }
    const fs = require('node:fs');
    fs.writeFileSync('screenshot.png', Buffer.from(data['screenshot'], 'binary'));
}

/**
 * Crawls the test endpoint with RenderHtmlCrawler. Requires the environment
 * variable RENDER_HTML_API_KEY.
 */
function main()
{
    const crawler = new RenderHtmlCrawler({
        render_html_api_key: process.env.RENDER_HTML_API_KEY,
        requestHandler
    });
    // The URL is passed as a plain string: `Request` is not imported from
    // crawlee in this script, so the bare name would refer to the global
    // Fetch API class (or be undefined on older Node versions).
    crawler.run([
        'https://render-html.com/test?redirect'
    ]).catch(error => {
        // Report a failed run instead of leaving the rejection unhandled.
        console.error(error);
        process.exitCode = 1;
    });
}

main();

Request parameters

Parameters are passed to the API endpoint via the GET query string, which contains key-value pairs encoded according to RFC 3986, except for the url_raw option, which if used must come last and is not encoded.

GET and POST request methods are supported, as well as the forwarding of supported HTTP headers. However, the following headers are ignored: user-agent, referer, cookie, authorization, proxy-authorization.

Here follows a description of the supported parameters.

render_mode: Comma-separated list of some of the values raw_data, browser_html, screenshot. The default value is raw_data and the other modes are only supported for the GET request method. Our unblocking capabilities are the same for all render modes. The loading of third-party iframes is disabled except for captcha solving purposes.

url or url_raw: The target URL. url_raw expects no additional percent-encoding, so it can be conveniently copied directly from the browser’s address bar, and it must be the last parameter, because everything that follows is treated as part of the target URL.

viewport_width: The width of the screenshot in pixels. Mandatory when using the screenshot render mode.

viewport_height: The height of the screenshot in pixels. When omitted, a full-page screenshot is taken, i.e. the height of the scrollable area is used.

format: Can be key_value (default) or content. With content one obtains the data of a single render mode in the response body. With key_value, the content is a repetition of the sequence key=value where newlines in the value are escaped by inserting a space after them. The values for the keys raw_data and screenshot are in general binary, whereas the value for browser_html always has the UTF-8 encoding.

render-html.com
Web scraping and unblocking service

Make a demo request

How to use

Endpoints

Example usage

Request parameters

Pricing

Contact

render-html.com: Web scraping and unblocking service

Make a demo request

How to use

Endpoints

Example usage

Request parameters

Pricing

Contact

render-html.com
Web scraping and unblocking service