The API endpoint is https://render-html.com/render. It expects request parameters
in the query string, and some of the request headers are forwarded to the scraped URL. An endpoint
for testing the integration is https://render-html.com/test.
The examples below require that the environment variable RENDER_HTML_API_KEY be set.
#!/usr/bin/env sh
# Example: request a screenshot of the test page through the render API and
# open the result with the desktop's default viewer.
# Requires the RENDER_HTML_API_KEY environment variable.

# Target URL. It is appended via url_raw as the last parameter of the API URL.
url='https://render-html.com/test?redirect'
options="\
&render_mode=screenshot\
&format=content\
&viewport_width=800\
&viewport_height=800\
"
api_url="https://render-html.com/render?$options&url_raw=$url"
# The URL is quoted so that its "?" characters are never treated as glob
# patterns and the value is never split into several words.
# On failure the response body saved by --fail-with-body is printed and the
# script exits with a non-zero status (a bare "exit" would reuse the status of
# "cat", which is normally 0).
curl --fail-with-body --no-progress-meter "$api_url" \
  -H "authorization:Bearer $RENDER_HTML_API_KEY" \
  > curl_output || { cat curl_output; exit 1; }
xdg-open curl_output
#!/usr/bin/env python
import re
import os
import scrapy
from dataclasses import dataclass
from scrapy.crawler import CrawlerProcess
@dataclass
class RenderHtmlResponse:
status: bytes
url: bytes
original_url: None|bytes = None
content_type: None|bytes = None
raw_data: None|bytes = None
browser_html: None|bytes = None
screenshot: None|bytes = None
@staticmethod
def deserialize(body: bytes):
return RenderHtmlResponse(**{
key.decode(): value.replace(b'\n ', b'\n')
for key, value in re.findall(rb'(.*?)(?:=|\n )([\s\S]*?)(?:\n(?! )|$)', body)
})
class RenderHtmlMiddleware:
    """Scrapy downloader middleware that sends requests through the render API.

    Outgoing requests are rewritten to point at the API, and the API's
    ``key_value`` response is unpacked so that the spider receives a response
    that looks like it came from the target URL.
    """

    API_BASE_URL = 'https://render-html.com/render?'

    def process_request(self, request, spider):
        """Replace ``request`` by an equivalent request to the render API.

        The query-string options are read from
        ``request.meta['render_html_options']`` (default: empty string).
        Setting that meta key to ``False`` disables the rewriting for the
        request. Requests that already point at the API are left untouched.

        Returns:
            ``None`` to let the request continue unchanged, or the
            replacement request addressed to the API.
        """
        options = request.meta.get('render_html_options', '')
        if options is False:
            return None
        if request.url.startswith(self.API_BASE_URL):
            # Already rewritten (this is the replacement request): let it through.
            return None
        api_key = spider.settings['RENDER_HTML_API_KEY']
        return request.replace(
            # url_raw must be the last parameter; the target URL is appended unencoded.
            url=self.API_BASE_URL + f'{options}&url_raw={request.url}',
            headers={**request.headers, 'authorization': b'Bearer ' + api_key}
        )

    def process_response(self, response, request, spider):
        """Unpack an API response into a response for the target URL.

        Responses to requests that did not go through the API are returned
        unchanged. A downloader middleware's ``process_response`` has to
        return a response (or request) object, so returning ``None`` here
        would be an error.

        Raises:
            Exception: if the API answered with a status other than 200; the
                exception carries the response body.
        """
        if not request.url.startswith(self.API_BASE_URL):
            return response
        if response.status != 200:
            raise Exception(response.body)
        rhr = RenderHtmlResponse.deserialize(response.body)
        response.headers.update({'content-type': rhr.content_type})
        response = response.replace(
            url=rhr.url.decode(),
            body=rhr.raw_data,
            status=rhr.status,
            request=request.replace(url=rhr.original_url.decode())
        )
        # Keep the complete parsed API response available to the spider.
        response.render_html_response = rhr
        return response
class ExampleSpider(scrapy.Spider):
    """Demo spider that fetches the test endpoint through RenderHtmlMiddleware."""

    name = 'ExamplesSpider'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {RenderHtmlMiddleware: 0}
    }

    def start_requests(self):
        """Yield the single request this example crawls."""
        test_request = scrapy.Request('https://render-html.com/test?redirect')
        yield test_request

    def parse(self, response):
        """Print the request, the response, its headers and its decoded body."""
        originating_request = response.request
        print(originating_request)
        print(response)
        header_map = response.headers
        print(header_map)
        decoded_body = response.body.decode()
        print(decoded_body)
def main():
    """Run ExampleSpider in a crawler process.

    The API key is read from the RENDER_HTML_API_KEY environment variable and
    handed to the crawler settings as bytes.
    """
    settings = {
        'LOG_LEVEL': 'ERROR',
        'RENDER_HTML_API_KEY': os.environ['RENDER_HTML_API_KEY'].encode(),
    }
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(ExampleSpider)
    crawler_process.start()


if __name__ == '__main__':
    main()
#!/usr/bin/env php
<?php declare(strict_types=1);
/**
 * Exception type used by this example for failures of the render API.
 */
class RenderHtmlException extends Exception
{
}
/**
 * Parsed form of a key_value formatted response body of the render API.
 *
 * Each public property holds the string found after the key of the same name
 * in the body; a property stays null when its key does not occur.
 */
class RenderHtmlResponse
{
    /**
     * @param mixed ...$args Collects named arguments that match no property,
     *                       so that additional keys in a response body do not
     *                       make construction fail.
     */
    function __construct(
        public ?string $original_url = null,
        public ?string $url = null,
        public ?string $status = null,
        public ?string $content_type = null,
        public ?string $raw_data = null,
        public ?string $browser_html = null,
        public ?string $screenshot = null,
        ...$args
    ) {}

    /**
     * Builds a RenderHtmlResponse from a key_value response body.
     *
     * The body is a sequence of key=value entries in which every newline
     * inside a value is followed by one extra space; that space is removed
     * again here.
     *
     * @param string $body Response body in the key_value format.
     * @return RenderHtmlResponse
     * @throws RenderHtmlException If the regular expression engine fails on
     *                             the body (for example when a PCRE limit is
     *                             hit on a very large value).
     */
    static function deserialize(string $body): RenderHtmlResponse
    {
        // A value ends at the first newline that is NOT followed by a space
        // or at the end of the body.
        $count = preg_match_all('/(.*?)(?:=|\n )([\s\S]*?)(?:\n(?! )|$)/', $body, $matches);
        if ($count === false) {
            // Report the failure instead of continuing with unusable matches.
            throw new RenderHtmlException(
                'Failed to parse the response body: ' . preg_last_error_msg()
            );
        }
        // The string keys are passed to the constructor as named arguments.
        return new RenderHtmlResponse(...array_combine(
            $matches[1],
            array_map(fn ($x) => str_replace("\n ", "\n", $x), $matches[2])
        ));
    }
}
/**
 * Fetches $url through the render API and returns the parsed response.
 *
 * @param string $api_key API key sent as bearer token.
 * @param string $url     Target URL; appended unencoded as url_raw.
 * @param string $options Further query-string options placed before url_raw.
 * @return RenderHtmlResponse
 * @throws RenderHtmlException If the API answers with a status other than
 *                             200; the message is the response body.
 * @throws Exception           If the transfer itself fails.
 */
function make_render_html_request(string $api_key, string $url,
                                  string $options=''): RenderHtmlResponse
{
    $api_url = "https://render-html.com/render?{$options}&url_raw=$url";
    $handle = curl_init();
    try {
        $settings = [
            CURLOPT_URL => $api_url,
            CURLOPT_HTTPHEADER => ["authorization:Bearer $api_key", 'accept:'],
            CURLOPT_RETURNTRANSFER => true,
        ];
        foreach ($settings as $option => $setting) {
            curl_setopt($handle, $option, $setting);
        }

        $payload = curl_exec($handle);
        if ($payload === false) {
            throw new Exception(curl_error($handle));
        }

        $http_code = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        if ($http_code === 200) {
            return RenderHtmlResponse::deserialize($payload);
        }
        throw new RenderHtmlException($payload);
    }
    finally {
        // Release the handle on every exit path.
        curl_close($handle);
    }
}
/**
 * Example entry point: fetches http://example.org through the render API
 * with the raw_data and screenshot render modes and prints every field of
 * the parsed response.
 *
 * Requires the RENDER_HTML_API_KEY environment variable.
 */
function main()
{
    $render_html_api_key = getenv('RENDER_HTML_API_KEY');
    if ($render_html_api_key === false) {
        // getenv() returns false for an unset variable; passing that on would
        // raise a TypeError because of the string parameter under strict_types.
        print("The environment variable RENDER_HTML_API_KEY must be set.\n");
        return;
    }
    try {
        $response = make_render_html_request(
            $render_html_api_key,
            'http://example.org',
            'render_mode=raw_data,screenshot&viewport_width=200&viewport_height=800'
        );
    }
    catch (RenderHtmlException $e) {
        // The message comes from the server and is passed as an argument, not
        // as the format string, so a "%" in it is printed literally.
        printf("Failed to fetch the response: %s\n", $e->getMessage());
        return;
    }
    // Iterating the object visits its public properties.
    foreach ($response as $key => $value) {
        $value = $value ?? '(null)';
        print("\e[1m$key:\e[0m\n$value\n\n");
    }
}
main();
#!/usr/bin/env node
const crawlee = require('crawlee');
/**
 * Crawler that sends every request through the render API, asking for a
 * screenshot with a viewport width of 500.
 */
class RenderHtmlCrawler extends crawlee.BasicCrawler
{
    static API_BASE_URL = 'https://render-html.com/render?';

    /**
     * @param {object} options
     * @param {string} options.render_html_api_key API key sent as bearer token.
     * @param {Function} options.requestHandler Handler passed on to BasicCrawler.
     */
    constructor({render_html_api_key, requestHandler})
    {
        super({requestHandler});
        this.render_html_api_key = render_html_api_key;
    }

    /**
     * Rewrites the request so that it targets the render API, then delegates
     * to the base implementation.
     */
    async _runRequestHandler(ctx)
    {
        const {request} = ctx;
        request.headers['user-agent'] = 'Crawlee';
        // The request object is mutated in place. Wrap the URL only once, so
        // that a request handled again (e.g. on a retry) does not get an API
        // URL nested inside another API URL.
        if (!request.url.startsWith(RenderHtmlCrawler.API_BASE_URL)) {
            // url_raw must be the last parameter; the target URL is appended unencoded.
            request.url = `${RenderHtmlCrawler.API_BASE_URL}` +
                `render_mode=screenshot&viewport_width=500&url_raw=${request.url}`;
        }
        request.headers['authorization'] = `Bearer ${this.render_html_api_key}`;
        await super._runRequestHandler(ctx);
    }
}
/**
 * Parses a key_value formatted response body into a plain object.
 *
 * The body is a sequence of key=value entries in which every newline inside a
 * value is followed by one extra space; that space is removed again here.
 *
 * @param {string} text Response body, one character per byte.
 * @returns {Object<string, string>} Map from key to unescaped value.
 */
function parseKeyValueBody(text)
{
    // A value ends at the first newline that is NOT followed by a space or at
    // the end of the body.
    const entryPattern = /(.*?)(?:=|\n )([\s\S]*?)(?:\n(?! )|$)/g;
    // Array.from is used because matchAll() returns an iterator, and iterators
    // only have a .map() method on runtimes that ship iterator helpers.
    return Object.fromEntries(
        Array.from(
            text.matchAll(entryPattern),
            ([, key, value]) => [key, value.replaceAll('\n ', '\n')]
        )
    );
}

/**
 * Sends the request, parses the key_value response and writes the screenshot
 * it contains to screenshot.png.
 *
 * @throws {Error} If the response status is not 200 or the response contains
 *     no screenshot.
 */
async function requestHandler({request, sendRequest})
{
    const response = await sendRequest();
    if (response.statusCode !== 200) {
        throw new Error(
            `render-html request failed with status ${response.statusCode}: ${response.body}`
        );
    }
    // 'binary' maps every byte to one character, so binary values survive the
    // round trip through a string.
    const data = parseKeyValueBody(response.rawBody.toString('binary'));
    if (data['screenshot'] === undefined) {
        throw new Error('The render-html response contains no screenshot');
    }
    const fs = require('node:fs');
    fs.writeFileSync('screenshot.png', Buffer.from(data['screenshot'], 'binary'));
}
/**
 * Example entry point: crawls the test endpoint through the render API.
 *
 * Requires the RENDER_HTML_API_KEY environment variable.
 *
 * @returns {Promise<void>} Settles when the crawl has finished.
 */
async function main()
{
    const crawler = new RenderHtmlCrawler({
        render_html_api_key: process.env.RENDER_HTML_API_KEY,
        requestHandler
    });
    // The start URL is passed as a plain string: no Request class is imported
    // in this file, so a bare `Request` would refer to the unrelated global
    // fetch Request (or to nothing at all on older runtimes).
    await crawler.run([
        'https://render-html.com/test?redirect'
    ]);
}
// Report a failed crawl instead of leaving the promise unhandled.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
url_raw option, which if used must come last and is not encoded.
user-agent, referer, cookie, authorization, proxy-authorization.
render_mode:
Comma-separated list of one or more of the values raw_data, browser_html,
screenshot. The default value is raw_data, and the other modes are only
supported for the GET request method. Our unblocking capabilities are the same
for all render modes.
The loading of third-party iframes is disabled except for captcha solving purposes.
url or url_raw:
The target URL. url_raw expects no additional percent-encoding, so it
can be conveniently copied directly from the browser’s address bar, and it must be the last
parameter, because everything that follows is treated as part of the target URL.
viewport_width:
The width of the screenshot in pixels. Mandatory when using the screenshot render mode.
viewport_height:
The height of the screenshot in pixels. When omitted, a full-page screenshot is taken, i.e. the
height of the scrollable area is used.
format:
Can be key_value (default) or content.
With content one obtains the data of a single render mode in the
response body.
With key_value, the content is a repetition of the sequence
key=value
where newlines in the value are escaped by inserting a space after them.
The values for the keys raw_data and screenshot are in general binary,
whereas the value for browser_html always has the UTF-8 encoding.
raw_data: 2.00 € per 1000 successful requests.
browser_html or screenshot or multiple render
modes combined: 6.00 € per 1000 successful requests.
contact@render-html.com.