diff --git a/Makefile b/Makefile index 4fae9b6b..fe42d480 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ SHELL := bash # this is the upstream libcurl-impersonate version -VERSION := 0.6.1 +VERSION := 0.6.2b2 CURL_VERSION := curl-8.1.1 $(CURL_VERSION): diff --git a/README-zh.md b/README-zh.md index 49978156..2b803902 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,18 +1,37 @@ # curl_cffi +[![Downloads](https://static.pepy.tech/badge/curl_cffi/week)](https://pepy.tech/project/curl_cffi) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/curl_cffi) +[![PyPI version](https://badge.fury.io/py/curl-cffi.svg)](https://badge.fury.io/py/curl-cffi) + [curl-impersonate](https://github.com/lwthiker/curl-impersonate) 的 Python 绑定,基于 [cffi](https://cffi.readthedocs.io/en/latest/). -不同于其他的纯 Python http 客户端,比如 `httpx` 和 `requests`,`curl_cffi `可以模拟浏览器的 -TLS 或者 JA3 指纹。如果你莫名其妙地被某个网站封锁了,可以来试试这个库。 +不同于其他的纯 Python http 客户端,比如 `httpx` 和 `requests`,`curl_cffi ` 可以模拟浏览器的 +TLS/JA3 和 HTTP/2 指纹。如果你莫名其妙地被某个网站封锁了,可以来试试 `curl_cffi`。 + +------ + +Scrapfly.io + +[Scrapfly](https://scrapfly.io/?utm_source=github&utm_medium=sponsoring&utm_campaign=curl_cffi) +是一个企业级的网页抓取 API,通过全流程托管来帮助你简化抓取流程。功能包括:真实浏览器 +渲染,代理自动切换,和 TLS、HTTP、浏览器指纹模拟,可以突破所有主要的反爬手段。Scrapfly +还提供了一个监控面板,让你能够随时观察抓取成功率。 + +如果你在寻找云端托管 `curl_cffi` 服务的话,Scrapfly 是一个不错的选择。如果你希望自己管理 +脚本,他们还提供了一个[工具](https://scrapfly.io/web-scraping-tools/curl-python/curl_cffi), +可以把 curl 命令直接转换成 `curl_cffi` 的 Python 代码。 + +------ ## 功能 - 支持 JA3/TLS 和 http2 指纹模拟。 -- 比 requests/tls_client 快得多,和 aiohttp/pycurl 的速度比肩,详情查看 [benchmarks](https://github.com/yifeikong/curl_cffi/tree/master/benchmark)。 +- 比 requests/httpx 快得多,和 aiohttp/pycurl 的速度比肩,详见 [benchmarks](https://github.com/yifeikong/curl_cffi/tree/master/benchmark)。 - 模仿 requests 的 API,不用再学一个新的。 -- 预编译,不需要再自己机器上再弄一遍。 -- 支持 `asyncio`,并且每个请求都可以换代理。 +- 预编译,不需要在自己机器上从头开始。 +- 支持 `asyncio`,并且支持每个请求切换代理。 - 支持 http 2.0,requests 不支持。 - 支持 websocket。 @@ -54,18 +73,23 @@ TLS 或者 JA3 
指纹。如果你莫名其妙地被某个网站封锁了,可 from curl_cffi import requests # 注意 impersonate 这个参数 -r = requests.get("https://tls.browserleaks.com/json", impersonate="chrome110") +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome110") print(r.json()) # output: {..., "ja3n_hash": "aa56c057ad164ec4fdcb7a5a283be9fc", ...} # ja3n 指纹和目标浏览器一致 +# To keep using the latest browser version as `curl_cffi` updates, +# simply set impersonate="chrome" without specifying a version. +# Other similar values are: "safari" and "safari_ios" +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome") + # 支持使用代理 proxies = {"https": "http://localhost:3128"} -r = requests.get("https://tls.browserleaks.com/json", impersonate="chrome110", proxies=proxies) +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome110", proxies=proxies) proxies = {"https": "socks://localhost:3128"} -r = requests.get("https://tls.browserleaks.com/json", impersonate="chrome110", proxies=proxies) +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome110", proxies=proxies) ``` ### Sessions @@ -152,35 +176,7 @@ with Session() as s: ws.run_forever() ``` -### 类 curl - -另外,你还可以使用类似 curl 的底层 API: - -```python -from curl_cffi import Curl, CurlOpt -from io import BytesIO - -buffer = BytesIO() -c = Curl() -c.setopt(CurlOpt.URL, b'https://tls.browserleaks.com/json') -c.setopt(CurlOpt.WRITEDATA, buffer) - -c.impersonate("chrome110") - -c.perform() -c.close() -body = buffer.getvalue() -print(body.decode()) -``` - -更多细节请查看 [英文文档](https://curl-cffi.readthedocs.io)。 - -### scrapy - -如果你用 scrapy 的话,可以参考这些中间件: - -- [tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint) -- [jxlil/scrapy-impersonate](https://github.com/jxlil/scrapy-impersonate) +对于底层 API, Scrapy 集成等进阶话题, 请查阅 [文档](https://curl-cffi.readthedocs.io) 有问题和建议请优先提 issue,中英文均可,也可以加 [TG 群](https://t.me/+lL9n33eZp480MGM1) 或微信群讨论: diff --git a/README.md b/README.md index 
a2d64f60..8f7db86e 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,35 @@ # curl_cffi -Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate) -via [cffi](https://cffi.readthedocs.io/en/latest/). +[![Downloads](https://static.pepy.tech/badge/curl_cffi/week)](https://pepy.tech/project/curl_cffi) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/curl_cffi) +[![PyPI version](https://badge.fury.io/py/curl-cffi.svg)](https://badge.fury.io/py/curl-cffi) [Documentation](https://curl-cffi.readthedocs.io) | [中文 README](https://github.com/yifeikong/curl_cffi/blob/main/README-zh.md) | [Discuss on Telegram](https://t.me/+lL9n33eZp480MGM1) +Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate) +via [cffi](https://cffi.readthedocs.io/en/latest/). + Unlike other pure python http clients like `httpx` or `requests`, `curl_cffi` can -impersonate browsers' TLS signatures or JA3 fingerprints. If you are blocked by some -website for no obvious reason, you can give this package a try. +impersonate browsers' TLS/JA3 and HTTP/2 fingerprints. If you are blocked by some +website for no obvious reason, you can give `curl_cffi` a try. + +------ + +Scrapfly.io + +[Scrapfly](https://scrapfly.io/?utm_source=github&utm_medium=sponsoring&utm_campaign=curl_cffi) +is an enterprise-grade solution providing a Web Scraping API that aims to simplify the +scraping process by managing everything: real browser rendering, rotating proxies, and +fingerprints (TLS, HTTP, browser) to bypass all major anti-bots. Scrapfly also unlocks +observability by providing an analytical dashboard and measuring the success rate/block +rate in detail. + +Scrapfly is a good choice if you are looking for a cloud-managed solution for `curl_cffi`. 
+If you are managing TLS/HTTP fingerprints by yourself with `curl_cffi`, they also maintain +[this tool](https://scrapfly.io/web-scraping-tools/curl-python/curl_cffi) to convert curl +commands into Python `curl_cffi` code! + +------ ## Features @@ -19,7 +41,7 @@ website for no obvious reason, you can give this package a try. - Supports http 2.0, which requests does not. - Supports websocket. -|library|requests|aiohttp|httpx|pycurl|curl_cffi| +||requests|aiohttp|httpx|pycurl|curl_cffi| |---|---|---|---|---|---| |http2|❌|❌|✅|✅|✅| |sync|✅|❌|✅|✅|✅| @@ -49,6 +71,8 @@ To install unstable version from GitHub: ## Usage +`curl_cffi` comes with a low-level `curl` API and a high-level `requests`-like API. + Use the latest impersonate versions, do NOT copy `chrome110` here without changing. ### requests-like @@ -57,29 +81,36 @@ Use the latest impersonate versions, do NOT copy `chrome110` here without changi from curl_cffi import requests # Notice the impersonate parameter -r = requests.get("https://tls.browserleaks.com/json", impersonate="chrome110") +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome110") print(r.json()) # output: {..., "ja3n_hash": "aa56c057ad164ec4fdcb7a5a283be9fc", ...} # the js3n fingerprint should be the same as target browser +# To keep using the latest browser version as `curl_cffi` updates, +# simply set impersonate="chrome" without specifying a version. 
+# Other similar values are: "safari" and "safari_ios" +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome") + # http/socks proxies are supported proxies = {"https": "http://localhost:3128"} -r = requests.get("https://tls.browserleaks.com/json", impersonate="chrome110", proxies=proxies) +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome110", proxies=proxies) proxies = {"https": "socks://localhost:3128"} -r = requests.get("https://tls.browserleaks.com/json", impersonate="chrome110", proxies=proxies) +r = requests.get("https://tools.scrapfly.io/api/fp/ja3", impersonate="chrome110", proxies=proxies) ``` ### Sessions ```python -# sessions are supported s = requests.Session() -# httpbin is a http test website + +# httpbin is an HTTP test website; this endpoint makes the server set cookies s.get("https://httpbin.org/cookies/set/foo/bar") print(s.cookies) # <Cookies[<Cookie foo=bar for httpbin.org />]> + +# retrieve cookies again to verify r = s.get("https://httpbin.org/cookies") print(r.json()) # {'cookies': {'foo': 'bar'}} @@ -108,7 +139,7 @@ However, only Chrome-like browsers are supported. Firefox support is tracked in Notes: 1. Added in version `0.6.0`. -2. fixed in version `0.6.0`, previous http2 fingerprints were [not correct](https://github.com/lwthiker/curl-impersonate/issues/215). +2. Fixed in version `0.6.0`; previous http2 fingerprints were [not correct](https://github.com/lwthiker/curl-impersonate/issues/215). ### asyncio @@ -155,35 +186,8 @@ with Session() as s: ws.run_forever() ``` -### curl-like - -Alternatively, you can use the low-level curl-like API: - -```python -from curl_cffi import Curl, CurlOpt -from io import BytesIO - -buffer = BytesIO() -c = Curl() -c.setopt(CurlOpt.URL, b'https://tls.browserleaks.com/json') -c.setopt(CurlOpt.WRITEDATA, buffer) - -c.impersonate("chrome110") - -c.perform() -c.close() -body = buffer.getvalue() -print(body.decode()) -``` - -See the [docs](https://curl-cffi.readthedocs.io) for more details. 
- -### scrapy - -If you are using scrapy, check out these middlewares: - -- [tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint) -- [jxlil/scrapy-impersonate](https://github.com/jxlil/scrapy-impersonate) +For low-level APIs, Scrapy integration and other advanced topics, see the +[docs](https://curl-cffi.readthedocs.io) for more details. ## Acknowledgement @@ -203,7 +207,7 @@ Yescaptcha is a proxy service that bypasses Cloudflare and uses the API interfac Scrape Ninja [ScrapeNinja](https://scrapeninja.net?utm_source=github&utm_medium=banner&utm_campaign=cffi) is a web scraping API with two engines: fast, with high performance and TLS -fingerprint; and slower with a real browser under the hood. +fingerprint; and slower with a real browser under the hood. ScrapeNinja handles headless browsers, proxies, timeouts, retries, and helps with data extraction, so you can just get the data in JSON. Rotating proxies are available out of diff --git a/assets/scrapfly.png b/assets/scrapfly.png new file mode 100644 index 00000000..e4b384f6 Binary files /dev/null and b/assets/scrapfly.png differ diff --git a/docs/advanced.rst b/docs/advanced.rst index 99e20928..0946ade6 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -16,7 +16,7 @@ Alternatively, you can use the low-level curl-like API: c.setopt(CurlOpt.URL, b'https://tls.browserleaks.com/json') c.setopt(CurlOpt.WRITEDATA, buffer) - c.impersonate("chrome110") + c.impersonate("chrome120") c.perform() c.close() diff --git a/docs/dev.rst b/docs/dev.rst index 06b71a0b..5d0a5a63 100644 --- a/docs/dev.rst +++ b/docs/dev.rst @@ -5,7 +5,7 @@ This page documents how to compile curl-impersonate and curl-cffi from source. I package is not available on your platform, you may refer to this page for some inspirations. First, you need to check if there are libcurl-impersonate binaries for you platform. If -so, you can +so, you can simply download and install them. 
For now, a pre-compiled `libcurl-impersonate` is downloaded from github and built into a bdist wheel, which is a binary package format used by PyPI. However, the diff --git a/docs/faq.rst b/docs/faq.rst index 500146b1..a3cc9133 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -91,7 +91,7 @@ To force curl to use http 1.1 only. from curl_cffi import requests, CurlHttpVersion - r = requests.get("https://postman-echo.com", http_version=CurlHttpVersion.v1_1) + r = requests.get("https://postman-echo.com", http_version=CurlHttpVersion.V1_1) Related issues: @@ -158,5 +158,3 @@ your own headers. requests.get(url, impersonate="chrome", default_headers=False, headers=...) - - diff --git a/docs/impersonate.rst b/docs/impersonate.rst index e98106d8..64d2f41b 100644 --- a/docs/impersonate.rst +++ b/docs/impersonate.rst @@ -104,7 +104,7 @@ is that, for a given browser version, the fingerprints are fixed. If you create random fingerprints, the server is easy to know that you are not using a typical browser. If you were thinking about ``ja3``, and not ``ja3n``, then the fingerprints is already -randomnized, due to the ``extension permutation`` feature introduced in Chrome 110. +randomized, due to the ``extension permutation`` feature introduced in Chrome 110. AFAIK, most websites use an allowlist, not a blocklist to filter out bot traffic. So I don’t think random ja3 fingerprints would work in the wild. 
diff --git a/libs.json b/libs.json index 9809798b..ae652eca 100644 --- a/libs.json +++ b/libs.json @@ -40,16 +40,53 @@ "machine": "x86_64", "pointer_size": 64, "libdir": "", - "sysname": "linux-gnu", + "sysname": "linux", + "link_type": "static", + "libc": "gnu", "so_name": "libcurl-impersonate-chrome.so", "so_arch": "x86_64" }, + { + "system": "Linux", + "machine": "x86_64", + "pointer_size": 64, + "libdir": "", + "sysname": "linux", + "link_type": "static", + "libc": "musl", + "so_name": "libcurl-impersonate-chrome.so", + "so_arch": "x86_64" + }, + { + "system": "Linux", + "machine": "i686", + "pointer_size": 32, + "libdir": "", + "sysname": "linux", + "link_type": "static", + "libc": "gnu", + "so_name": "libcurl-impersonate-chrome.so", + "so_arch": "i386" + }, + { + "system": "Linux", + "machine": "aarch64", + "pointer_size": 64, + "libdir": "", + "sysname": "linux", + "link_type": "static", + "libc": "gnu", + "so_name": "libcurl-impersonate-chrome.so", + "so_arch": "aarch64" + }, { "system": "Linux", "machine": "aarch64", "pointer_size": 64, "libdir": "", - "sysname": "linux-gnu", + "sysname": "linux", + "link_type": "dynamic", + "libc": "musl", "so_name": "libcurl-impersonate-chrome.so", "so_arch": "aarch64" }, @@ -58,7 +95,9 @@ "machine": "armv6l", "pointer_size": 32, "libdir": "", - "sysname": "linux-gnueabihf", + "sysname": "linux", + "link_type": "static", + "libc": "gnueabihf", "so_name": "libcurl-impersonate-chrome.so", "so_arch": "arm" }, @@ -67,7 +106,9 @@ "machine": "armv7l", "pointer_size": 32, "libdir": "", - "sysname": "linux-gnueabihf", + "sysname": "linux", + "link_type": "static", + "libc": "gnueabihf", "so_name": "libcurl-impersonate-chrome.so", "so_arch": "arm" } diff --git a/pyproject.toml b/pyproject.toml index a1d4466b..a0e31be7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [project] name = "curl_cffi" -version = "0.6.2" +version = "0.6.3b1" authors = [{ name = "Yifei Kong", email = "kong@yifei.me" }] -description = 
"libcurl ffi bindings for Python, with impersonation support" +description = "libcurl ffi bindings for Python, with impersonation support." license = { file = "LICENSE" } dependencies = [ "cffi>=1.12.0", @@ -44,7 +44,6 @@ dev = [ "uvicorn==0.18.3", "websockets==11.0.3", "ruff==0.1.14", - "nest_asyncio==1.6.0", ] build = [ "cibuildwheel", @@ -65,7 +64,6 @@ test = [ "websockets==11.0.3", "python-multipart==0.0.6", "fastapi==0.100.0", - "nest_asyncio==1.6.0", ] @@ -91,12 +89,15 @@ build = [ "cp38-win32", "cp38-manylinux_x86_64", "cp38-manylinux_aarch64", + "cp38-manylinux_i686", + "cp38-musllinux_x86_64", + "cp38-musllinux_aarch64", ] before-all = "make preprocess" test-requires = "pytest" test-command = "python -bb -m pytest {project}/tests/unittest" test-extras = ["test"] -test-skip = "pp*" +test-skip = "cp38-manylinux_i686" # trustme not available build-verbosity = 1 diff --git a/scripts/build.py b/scripts/build.py index 01b8d7cd..b00ef2e7 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -10,19 +10,25 @@ from cffi import FFI # this is the upstream libcurl-impersonate version -__version__ = "0.6.2b1" +__version__ = "0.6.2b2" tmpdir = None def detect_arch(): with open(Path(__file__).parent.parent / "libs.json") as f: archs = json.loads(f.read()) + + libc, _ = platform.libc_ver() + # https://github.com/python/cpython/issues/87414 + libc = "gnu" if libc == "glibc" else "musl" uname = platform.uname() pointer_size = struct.calcsize("P") * 8 + for arch in archs: if ( arch["system"] == uname.system and arch["machine"] == uname.machine and arch["pointer_size"] == pointer_size + and ("libc" not in arch or arch.get("libc") == libc) ): if arch["libdir"]: arch["libdir"] = os.path.expanduser(arch["libdir"]) @@ -43,10 +49,14 @@ def download_libcurl(): return file = "libcurl-impersonate.tar.gz" + if arch["system"] == "Linux": + sysname = "linux-" + arch["libc"] + else: + sysname = arch["sysname"] url = ( f"https://github.com/yifeikong/curl-impersonate/releases/download/" 
f"v{__version__}/libcurl-impersonate-v{__version__}" - f".{arch['so_arch']}-{arch['sysname']}.tar.gz" + f".{arch['so_arch']}-{sysname}.tar.gz" ) print(f"Downloading libcurl-impersonate-chrome from {url}...") @@ -60,7 +70,7 @@ def download_libcurl(): shutil.copy2(f"{arch['libdir']}/libcurl.dll", "curl_cffi") def get_curl_archives(): - if arch["system"] == "Linux": + if arch["system"] == "Linux" and arch.get("link_type") == "static": # note that the order of libraries matters # https://stackoverflow.com/a/36581865 return [ @@ -81,6 +91,8 @@ def get_curl_libraries(): return ["libcurl"] elif arch["system"] == "Darwin": return ["curl-impersonate-chrome"] + elif arch["system"] == "Linux" and arch.get("link_type") == "dynamic": + return ["curl-impersonate-chrome"] else: return []