Compare commits

...

5 Commits

Author SHA1 Message Date
f9659246ca Implement zippy resolver with js evaluation
- Add ducc crate (Duktape bindings) as JavaScript engine
- Extract the script tag from zippyshare page contents
- Preprocess the script and execute the js to get the link
- This removes the need for a full regex-based implementation for each
  antiscrape challenge
2022-09-13 15:37:20 +02:00
bc2d312ce9 Update zippyshare resolver 2022-07-24 + bump
- Bump version to 0.1.6
2022-08-16 21:42:08 +02:00
0f7e05a71d Update zippyshare resolver 2022-07-24 + bump
- Bump version to 0.1.5
2022-07-24 15:39:00 +02:00
2e0c12ee56 Bump version to 0.1.4 2022-07-17 23:34:19 +02:00
7606f90384 Update crossterm dependency
- Crossterm 0.24 was released with the merged bugfix, so no need for
  the pinned git dependency anymore
2022-07-17 23:33:21 +02:00
3 changed files with 71 additions and 96 deletions

33
Cargo.lock generated
View File

@ -64,6 +64,12 @@ version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]]
name = "cesu8"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -140,8 +146,9 @@ checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
[[package]] [[package]]
name = "crossterm" name = "crossterm"
version = "0.23.2" version = "0.24.0"
source = "git+https://github.com/crossterm-rs/crossterm.git?rev=21155716e2eedd5ba8ab97168e5eed7cd50d2ad8#21155716e2eedd5ba8ab97168e5eed7cd50d2ad8" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab9f7409c70a38a56216480fba371ee460207dd8926ccf5b4160591759559170"
dependencies = [ dependencies = [
"bitflags", "bitflags",
"crossterm_winapi", "crossterm_winapi",
@ -162,6 +169,25 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "ducc"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41bc1f8a30712eb6a7454f85747f218d9dfb41d173bb223a8c4f18daff829207"
dependencies = [
"cesu8",
"ducc-sys",
]
[[package]]
name = "ducc-sys"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0cdea834bf6a0fde522374db4404695c5f0465fc0ee814f2878d76eaabd4ffed"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "encoding_rs" name = "encoding_rs"
version = "0.8.31" version = "0.8.31"
@ -182,12 +208,13 @@ dependencies = [
[[package]] [[package]]
name = "ffdl" name = "ffdl"
version = "0.1.3" version = "0.1.6"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
"clap", "clap",
"crossterm", "crossterm",
"ducc",
"futures", "futures",
"percent-encoding", "percent-encoding",
"regex", "regex",

View File

@ -1,6 +1,6 @@
[package] [package]
name = "ffdl" name = "ffdl"
version = "0.1.3" version = "0.1.6"
authors = ["daniel m <danielm@dnml.de>"] authors = ["daniel m <danielm@dnml.de>"]
edition = "2021" edition = "2021"
description = "Download files fast" description = "Download files fast"
@ -15,8 +15,5 @@ clap = { version = "3.1.12", features = [ "derive" ] }
chrono = "0.4.19" chrono = "0.4.19"
thiserror = "1.0.30" thiserror = "1.0.30"
anyhow = "1.0.57" anyhow = "1.0.57"
crossterm = "0.24.0"
# crossterm had a bug that caused issues on windows with git-bash. This was fixed in a pr, but the ducc = "0.1.5"
# new version is not yet published to crates.io, so a git dependency with commit pinning is used
# crossterm = "0.23.2"
crossterm = { git = "https://github.com/crossterm-rs/crossterm.git", rev = "21155716e2eedd5ba8ab97168e5eed7cd50d2ad8" }

View File

@ -1,17 +1,18 @@
use std::io::{Error, ErrorKind}; use std::io::{Error, ErrorKind};
use anyhow::Result; use anyhow::Result;
use ducc::Ducc;
use regex::Regex; use regex::Regex;
pub fn is_zippyshare_url(url: &str) -> bool { pub fn is_zippyshare_url(url: &str) -> bool {
Regex::new(r"^https?://(?:www\d*\.)?zippyshare\.com/v/[0-9a-zA-Z]+/file\.html$") Regex::new(r#"^https?://(?:www\d*\.)?zippyshare\.com/v/[0-9a-zA-Z]+/file\.html$"#)
.unwrap() .unwrap()
.is_match(url) .is_match(url)
} }
pub async fn resolve_link(url: &str) -> Result<String> { pub async fn resolve_link(url: &str) -> Result<String> {
// Regex to check if the provided url is a zippyshare download url // Regex to check if the provided url is a zippyshare download url
let re = Regex::new(r"^(https?://(?:www\d*\.)?zippyshare\.com)/v/[0-9a-zA-Z]+/file\.html$")?; let re = Regex::new(r#"^(https?://(?:www\d*\.)?zippyshare\.com)/v/[0-9a-zA-Z]+/file\.html$"#)?;
if !re.is_match(url) { if !re.is_match(url) {
return Err(Error::new(ErrorKind::Other, "URL is not a zippyshare url").into()); return Err(Error::new(ErrorKind::Other, "URL is not a zippyshare url").into());
} }
@ -22,100 +23,50 @@ pub async fn resolve_link(url: &str) -> Result<String> {
// Download the html body for the download page // Download the html body for the download page
let body = reqwest::get(url).await?.text().await?; let body = reqwest::get(url).await?.text().await?;
// Try to extract the link using the latest extractor let re_script =
let link = extract_dl_link_2022_07_17(&host, &body).await; Regex::new(r#"(?ms)<script.*?>(.*getElementById\('dlbutton'\).*?)</script>"#).unwrap();
// Try the previous extractor as fallback if it didn't work let re_script_start = Regex::new(r#"(?ms)<script.*?>"#).unwrap();
let link = match link {
Err(_) => extract_dl_link_2022_03_07(&host, &body).await,
ok => ok,
};
link // Extract the script. This will end at the correct script end, but has stuff before the start
} let cap_tmp = match re_script.captures(&body) {
/*
Updated: 17.07.2022
Link generation code:
- `var a = $1`
- $1 is the only variable that actually changes
- effectively: `var b = "asdasd".substr(0, 3).length` seems to be fixed
- evaluates to: `var b = 3`
- `document.getElementById('dlbutton').href = "/d/0Ky7p1C6/"+(Math.pow(a, 3)+b)+"/some-file-name.part1.rar"`
- evaluates to: `href = "/d/0Ky7p1C6/"+(Math.pow(a, 3)+3)+"/some-file-name.part1.rar"`
```
var a = 114;
document.getElementById('dlbutton').omg = "asdasd".substr(0, 3);
var b = document.getElementById('dlbutton').omg.length;
document.getElementById('dlbutton').href = "/d/0Ky7p1C6/"+(Math.pow(a, 3)+b)+"/some-file-name.part1.rar";
```
*/
pub async fn extract_dl_link_2022_07_17(host: &str, body: &str) -> Result<String> {
let re_var_a = Regex::new(
r#"var a = (\d+);"#
)?;
// Regex to match the javascript part of the html that generates the real download link
let re_link = Regex::new(
r#"document\.getElementById\('dlbutton'\)\.href = "(/d/.+/)"\+\(Math\.pow\(a, 3\)\+b\)\+"(.+)";"#,
)?;
let cap_var_a = match re_var_a.captures(&body) {
Some(cap) => cap,
None => return Err(Error::new(ErrorKind::Other, "Var a not found").into()),
};
let cap_link = match re_link.captures(&body) {
Some(cap) => cap, Some(cap) => cap,
None => return Err(Error::new(ErrorKind::Other, "Link not found").into()), None => return Err(Error::new(ErrorKind::Other, "Link not found").into()),
}; };
let temp = &cap_tmp[1];
let url_start = &cap_link[1]; // Find the correct script start
let url_end = &cap_link[2]; let pos_script_start = match re_script_start.find_iter(&temp).last() {
let var_a: i64 = cap_var_a[1].parse()?;
let middle = var_a.pow(3) + 3;
let dl_url = format!("{}{}{}{}", &host, url_start, middle, url_end);
Ok(dl_url)
}
/*
Updated: 07.03.2022
Link generation code:
- `href = $1 + ($2 % $3 + $4 % $5) + $6`
- `$1` is always `/d/XXX` where XXX is dependent on the file
- `$2`, `$3`, `$4` and `$5` are dynamic and randomly generated on each reload
- `$2` is always the same as `$4`
- `$6` is dependent on the file
- The numbers in the calculation part ($2`, `$3`, `$4` and `$5`) are hard coded
```
document.getElementById('dlbutton').href = "/d/0Ky7p1C6/" + (186549 % 51245 + 186549 % 913) + "/some-file-name.part1.rar";
```
*/
pub async fn extract_dl_link_2022_03_07(host: &str, body: &str) -> Result<String> {
// Regex to match the javascript part of the html that generates the real download link
let re_link = Regex::new(
r#"document\.getElementById\('dlbutton'\)\.href = "(/d/.+/)" \+ \((\d+) % (\d+) \+ \d+ % (\d+)\) \+ "(.+)";"#,
)?;
let cap_link = match re_link.captures(&body) {
Some(cap) => cap, Some(cap) => cap,
None => return Err(Error::new(ErrorKind::Other, "Link not found").into()), None => return Err(Error::new(ErrorKind::Other, "Link not found").into()),
}; };
// Cut off the beginning to get only the script contents
let raw_script = &temp[pos_script_start.end()..];
let url_start = &cap_link[1]; // Preprocess the script
let url_end = &cap_link[5]; let script = preprocess_js(raw_script);
let n2: i32 = cap_link[2].parse()?;
let n3: i32 = cap_link[3].parse()?;
let n4 = n2;
let n5: i32 = cap_link[4].parse()?;
let mixed = n2 % n3 + n4 % n5; // Calculate the link
let link = eval_js_link_calculation(&script)
.map_err(|_| Error::new(ErrorKind::Other, "Link not found: JS eval error"))?;
let dl_url = format!("{}{}{}{}", &host, url_start, mixed, url_end); let url = format!("{}{}", host, link);
Ok(url)
Ok(dl_url) }
/// Rewrites the page's link-generation script so it can run without a DOM.
///
/// Each known `document.getElementById(...)` expression is textually replaced
/// by a plain identifier or literal, and `;href` is appended so that `href`
/// is the final expression of the returned script.
fn preprocess_js(js_src: &str) -> String {
    // Ordered (needle, stand-in) pairs; they are applied one after another,
    // each on the output of the previous substitution.
    const SUBSTITUTIONS: [(&str, &str); 4] = [
        ("document.getElementById('dlbutton').href", "href"),
        ("document.getElementById('fimage')", "false"),
        // Fix for antiscrape 24.07.2022
        ("document.getElementById('omg').getAttribute('class')", "2"),
        // Fix for antiscrape 16.08.2022
        ("document.getElementById('dlbutton').omg", "omg"),
    ];

    let rewritten = SUBSTITUTIONS
        .iter()
        .fold(js_src.to_owned(), |acc, &(needle, stand_in)| {
            acc.replace(needle, stand_in)
        });

    // Trailing expression statement that names the computed link variable.
    format!("{};href", rewritten)
}
/// Evaluates `js_src` in a freshly created `Ducc` (duktape) context.
///
/// The outcome of `Ducc::exec` is returned as a `String`; failures are
/// reported through `ducc::Result`.
// NOTE(review): presumably the returned value is that of the script's last
// expression (the caller appends `;href` to the script) — confirm against the
// ducc documentation.
fn eval_js_link_calculation(js_src: &str) -> ducc::Result<String> {
let ducc = Ducc::new();
// `None` and `Default::default()` look like "no script name" and "default
// execution settings" — TODO confirm parameter meanings.
ducc.exec(js_src, None, Default::default())
} }