forked from dtbaker/envato-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
class.envato_scraper.php
614 lines (532 loc) · 25.6 KB
/
class.envato_scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
<?php
// When true, the scraper echoes request/response diagnostics and writes debug
// copies of fetched pages into _ENVATO_TMP_DIR.
define('_ENVATO_DEBUG_MODE',false);
// Directory (with trailing slash) used for the response cache, the curl cookie
// jar and the temporary statement CSV file. Must be writable by PHP.
define('_ENVATO_TMP_DIR',dirname(__FILE__).'/envato-cache/');
// Salt mixed into the md5-based cache keys and the cookie jar file name.
// NOTE(review): change this per installation rather than shipping the default.
define('_ENVATO_SECRET',"rhrh65zre6her6j42l3i4u2j34k2134nlkj2h42kjgasf"); // some unique code
/**
 * Screen-scraping client for the Envato marketplaces.
 *
 * Logs in through account.envato.com with curl (cookies and responses are
 * stored under _ENVATO_TMP_DIR) and extracts data from the returned HTML/CSV
 * with regular expressions.
 */
class envato_scraper{
// set to true once do_login() has printed a password / recaptcha form; further
// do_login() calls on this instance then bail out immediately.
private $waiting_on_recaptcha = false;
// true once a signed-in session has been confirmed.
private $logged_in = false;
// username given to do_login(); also used in the cookie jar and statement temp file names.
private $username = false;
// list of all supported marketplaces.
private $marketplaces = array(
"http://themeforest.net",
"http://codecanyon.net",
"http://activeden.net",
"http://audiojungle.net",
"http://videohive.net",
"http://graphicriver.net",
"http://3docean.net",
"http://photodune.net",
);
// keyed by marketplace tag (e.g. "themeforest"), value true once authenticated.
private $authed_marketplaces=array();// which ones we have /sign_in?auto=true&to=X to
// keyed by marketplace tag, value is the csrf-token scraped from that marketplace.
private $authenticity_tokens=array();
// base url (no trailing slash) that every scraping request is built on.
private $main_marketplace = 'http://themeforest.net';
/**
 * Create a scraper bound to one "main" marketplace.
 *
 * @param string $main_marketplace base url of the marketplace to scrape. Values
 *        that are not in the supported list are ignored and the default
 *        (themeforest) is kept.
 */
public function __construct($main_marketplace='http://themeforest.net'){
    $is_supported = in_array($main_marketplace,$this->marketplaces);
    if($is_supported){
        $this->main_marketplace = $main_marketplace;
    }
    // The cache, cookie jar and temp files all live in this directory, so warn
    // straight away if it cannot be used.
    $tmp_dir_usable = is_dir(_ENVATO_TMP_DIR) && is_writable(_ENVATO_TMP_DIR);
    if(!$tmp_dir_usable){
        echo 'please make sure the temp directory '._ENVATO_TMP_DIR.' is writable by PHP scripts.';
    }
}
/**
 * Fetch the "new files from user" listing for a user from the public Envato API.
 *
 * One request is made per marketplace tag and the item lists of all responses
 * are merged into a single array. Requests are not forced, so responses may be
 * served from the local cache.
 *
 * @param string $user Envato username.
 * @param array|string $from_marketplaces marketplace tag(s) to query, e.g. "themeforest".
 *        When empty (the default) every supported marketplace is queried.
 * @return array of items (empty when nothing usable came back)
 */
public function get_users_items($user,$from_marketplaces=array()){
    $files = array();
    if(!is_array($from_marketplaces))$from_marketplaces=array($from_marketplaces);
    if(!$from_marketplaces){
        // No marketplace given: fall back to all of them, as the contract promises.
        // The API wants the short tag, so strip scheme and tld from the stored urls.
        foreach($this->marketplaces as $marketplace_url){
            $from_marketplaces[] = str_replace('.net','',str_replace('http://','',$marketplace_url));
        }
    }
    foreach($from_marketplaces as $marketplace){
        //http://marketplace.envato.com/api/v2/new-files-from-user:collis,themeforest.json
        $url = "http://marketplace.envato.com/api/v2/new-files-from-user:$user,$marketplace.json";
        if(_ENVATO_DEBUG_MODE){
            echo " Grabbing API url: $url <bR>\n";
        }
        $data = $this->_get_url($url,array(),false);
        if(empty($data)){
            continue;
        }
        $json_data = json_decode($data, true);
        if(_ENVATO_DEBUG_MODE){
            echo "data: ";print_r($json_data);
        }
        // Error responses and invalid JSON have no item list; merging them
        // would raise an error instead of just contributing nothing.
        if(is_array($json_data)
            && isset($json_data['new-files-from-user'])
            && is_array($json_data['new-files-from-user'])){
            $files = array_merge($files,$json_data['new-files-from-user']);
        }
    }
    return $files;
}
/**
 * Complete the single-sign-on handshake for one marketplace.
 *
 * Loads the account centre auto sign-in page for the marketplace, posts the
 * tokens found there to the marketplace's /sso/verify_token endpoint and, when
 * the response shows a sign-out link, remembers the marketplace as
 * authenticated together with its csrf-token (if one is present on the page).
 *
 * @param string $url marketplace base url, must be one of the supported marketplaces
 * @return bool true when the marketplace is (or already was) authenticated
 */
public function authenticate_marketplace($url){
    if(!in_array($url,$this->marketplaces,true))return false;
    $marketplace_tag = str_replace('.net','',str_replace('http://','',$url));
    if(isset($this->authed_marketplaces[$marketplace_tag])){
        // Already verified earlier: report success instead of falling through
        // to the failure return (callers store this result as the login state).
        $this->logged_in = true;
        return true;
    }
    $auth_check = $this->_get_url('https://account.envato.com/sign_in?auto=true&to='.$marketplace_tag,array(),true);
    // Either hidden field may be absent (e.g. not signed in); post an empty
    // value in that case rather than reading a missing match.
    $authenticity_token = '';
    if(preg_match('#name="authenticity_token" type="hidden" value="([^"]+)"#',$auth_check ,$matches)){
        $authenticity_token = $matches[1];
    }
    $token = '';
    if(preg_match('#name="token" type="hidden" value="([^"]+)"#',$auth_check ,$matches)){
        $token = $matches[1];
    }
    $post = array(
        'utf8' => '✓',
        'authenticity_token' => $authenticity_token,
        'token' => $token
    );
    $auth_check = $this->_get_url('http://'.$marketplace_tag.'.net/sso/verify_token', $post, true);
    if(!preg_match('#/sign_out["\?]#',$auth_check)){
        return false;
    }
    $this->authed_marketplaces[$marketplace_tag]=true;
    // Only store a csrf-token that was actually found on the page.
    if(preg_match('#<meta content="([^"]+)" name="csrf-token" />#', $auth_check, $hits)){
        $this->authenticity_tokens[$marketplace_tag]=$hits[1];
    }
    return true;
}
/**
 * do_login! Logs you into the envato marketplaces.
 *
 * Flow: fetch the forums page of the main marketplace; if it already shows a
 * sign-out link the session is confirmed via authenticate_marketplace().
 * Otherwise the account centre sign-in form is submitted with the given
 * credentials and the method calls itself once more to check the result.
 *
 * When run from a browser it can print an HTML form asking for a missing
 * password (plus optional two-factor code) or a recaptcha answer; the submitted
 * values are picked up from $_POST / $_REQUEST on the next call.
 *
 * @param string $username
 * @param string $password may be empty, in which case the password form is printed
 * @param int $try_number internal retry counter, gives up once it exceeds 1
 * @param bool|string $data internal: page html handed over from the previous attempt
 * @return bool|int true when logged in, false when login is not possible,
 *         0 when a form was printed or the account is temporarily locked out
 */
public function do_login($username,$password,$try_number=0,$data=false){
    if($this->logged_in)return true;
    $this->username = $username;
    if($this->waiting_on_recaptcha){
        echo 'Waiting on recaptcha or manual password entry. Run script from browser.';
        return false;
    }
    if(!$data){
        $data = $this->_clean($this->_get_url($this->main_marketplace.'/forums',array(),true));
    }
    if($try_number>1){
        // TODO: handle if envato is down for maintenance
        echo "Unable to login. Sorry, please try again shortly.";
        return false;
    }
    if(preg_match('#/sign_out["\?]#',$data)){
        // if sign_out is present on the page then we are logged in
        // new redirect hack with new account centre setup
        $this->logged_in = $this->authenticate_marketplace($this->main_marketplace);
        return $this->logged_in;
    }
    if(!$username){
        // no username set, return false so the caller can prompt for a login.
        return false;
    }

    // All form field names are suffixed with this so several marketplaces can share a page.
    $field_suffix = md5($this->main_marketplace);
    // The username ends up inside HTML below, so escape it once here.
    $safe_username = htmlspecialchars((string)$username, ENT_QUOTES, 'UTF-8');

    // NOTE(review): this request is not forced, so the sign-in page (and its
    // authenticity token) may come from the local cache - confirm that is intended.
    $data = $this->_get_url('https://account.envato.com');
    if(!preg_match('#name="authenticity_token" type="hidden" value="([^"]+)"#',$data,$matches)){
        echo 'failed. no auth token found on page';
        return $this->do_login($username,$password,$try_number+1,$data);
    }
    $auth_token = $matches[1];
    if(!$auth_token){
        echo 'failed: no auth token';
        return $this->do_login($username,$password,$try_number+1,$data);
    }

    if(isset($_POST['envatopassword'.$field_suffix])){
        $password = $_POST['envatopassword'.$field_suffix];
    }
    if(!$password){
        // prompt for password
        $this->waiting_on_recaptcha=true; //re-use this feature from the captcha thingey.
        ?>
<br>
<form action="" method="post">
Enter Envato Password for account "<?php echo $safe_username;?>": <input type="password" name="envatopassword<?php echo $field_suffix;?>"> <br>
Enter Envato Two-Factor for account "<?php echo $safe_username;?>" (optional): <input type="text" name="envatopasswordtwofactor<?php echo $field_suffix;?>"> <br>
<input type="submit" name="go" value="Submit">
</form>
        <?php
        return 0;
    }

    $post_data = array(
        "username"=>$username,
        "password"=>$password,
        "authenticity_token" => $auth_token,
        "utf8" => '✓',
        "commit" => 'Sign In',
    );
    if(isset($_REQUEST['envatopasswordtwofactor'.$field_suffix])){
        $post_data['authentication_code'] = $_REQUEST['envatopasswordtwofactor'.$field_suffix];
    }
    if(isset($_POST['recaptcha'.$field_suffix])){
        $post_data["recaptcha_challenge_field"]=$_POST['recaptcha'.$field_suffix];
        $post_data["recaptcha_response_field"]='manual_challenge';
        // consume the answer so it is not echoed back into a follow-up form.
        unset($_POST['recaptcha'.$field_suffix]);
    }
    if(_ENVATO_DEBUG_MODE){
        echo "Login attempt $try_number with username: ".$safe_username." <br> ";
    }
    $url = "https://account.envato.com/sign_in";
    $data = $this->_get_url($url,$post_data,true);
    if(_ENVATO_DEBUG_MODE){
        file_put_contents(_ENVATO_TMP_DIR."debug-envato_login-".$try_number.".html",$data);
        echo "Saved LOGIN ATTEMPT file at: "._ENVATO_TMP_DIR."debug-envato_login-".$try_number.".html <br>";
    }

    if(preg_match('#temporarily locked out#',$data)){
        echo "Sorry, temporarily locked out for too many failed login attempts.";
        return 0;
    }
    if(preg_match('#recaptcha/api/noscript#',$data)){
        $this->waiting_on_recaptcha=true;
        echo "Sorry, too many failed envato login attempts on ".$this->main_marketplace.". Please enter the re-captcha code below. <br>";
        // <iframe src="https://www.google.com/recaptcha/api/noscript?k=6LeL1wUAAAAAAJ6M4Rd6GzH86I_9_snNaLPqy_ff" h
        if(preg_match('#<iframe src="https://www.google.com/recaptcha/api/noscript[^"]*"[^>]*>#',$data,$matches)){
            // the iframe markup comes from the sign-in response and has to be emitted as-is to render.
            echo $matches[0].'</iframe>';
            ?>
<br>
<form action="" method="post">
Enter Code: <input type="text" name="recaptcha<?php echo $field_suffix;?>"> <input type="submit" name="go" value="Submit">
            <?php
            // carry the already submitted password / two-factor fields over to the next request.
            foreach($_POST as $key=>$val){
                if(!is_scalar($val)){
                    continue;
                }
                $key = (string)$key;
                if(strpos($key,'recaptcha')!==false || strpos($key,'envatopassword')!==false){
                    ?>
<input type="hidden" name="<?php echo htmlspecialchars($key, ENT_QUOTES, 'UTF-8');?>" value="<?php echo htmlspecialchars((string)$val, ENT_QUOTES, 'UTF-8');?>">
                    <?php
                }
            }
            ?>
</form>
            <?php
        }
        return 0;
    }

    // Sign-in was posted: run once more so the response is checked for the sign-out link.
    return $this->do_login($username,$password,$try_number+1,$data);
}
/**
 *
 * This method will return an array of purchased items.
 *
 * The path and query of the given link are requested on the main marketplace
 * and the "Purchases of your files" list on that page is parsed. Each entry of
 * the result has the keys item_id, item_url, item_name, url, text, license and
 * date. license and date are null when the list entry does not contain a
 * recognisable license or age ("N days/months/years ago").
 *
 * @param string $url the url from your email e.g. http://codecanyon.net/user/USERNAME?pm_key=OTgxMjYx%0B
 *
 * @return array
 */
public function verify_email_link($url){
    $purchases = array();
    $urlparts = parse_url($url);
    if(!is_array($urlparts)){
        // seriously malformed url, nothing to look up.
        return $purchases;
    }
    $path = isset($urlparts['path']) ? $urlparts['path'] : '';
    // item links are rebuilt on the host of the email link; fall back to the main marketplace host.
    $host = isset($urlparts['host']) ? $urlparts['host'] : str_replace('http://','',$this->main_marketplace);
    //login always on the main marketplace
    $request_url = $this->main_marketplace.$path;
    if(isset($urlparts['query']) && $urlparts['query'] !== ''){
        $request_url .= '?'.$urlparts['query'];
    }
    $data = $this->_clean($this->_get_url($request_url));
    //if we found some purchased files
    if(preg_match('#<h2 class="underlined">Purchases of your files</h2> <ul class="fancy-list">#s', $data)){
        //grab them and put them in an array
        $raw = array();
        if(preg_match('#<ul class="fancy-list">(.*)(days?|months?|years?) ago<\/li>(.*)?<\/ul>#s', $data, $hits)){
            $raw = explode('<br>', strip_tags(str_replace('</li>', '</li><br>', $hits[0]), '<a><br>'));
        }
        foreach($raw as $purchase){
            if(!preg_match('#href="([^"]+)\/(\d+)"#', $purchase, $item) || empty($item[2])) continue;
            $has_time = preg_match('#(\d+) (days?|months?|years?) ago#', $purchase, $time);
            $has_license = preg_match('#(Regular|Extended) License#', $purchase, $license);
            // strip only the fragments that were really found from the visible text.
            $strip = array();
            if($has_license) $strip[] = $license[0];
            if($has_time) $strip[] = $time[0];
            $text = strip_tags($purchase);
            $purchases[] = array(
                'item_id' => $item[2],
                'item_url' => str_replace(array('href="', '"'), array('http://'.$host, ''), $item[0]),
                'item_name' => trim($strip ? str_replace($strip, '', $text) : $text),
                'url' => $url,
                'text' => trim($text),
                'license' => $has_license ? $license[1] : null,
                'date' => $has_time ? date('Y-m-d', strtotime('- '.$time[1].' '.$time[2])) : null,
            );
        }
        if(_ENVATO_DEBUG_MODE){
            echo 'found purchases in '.$url;
            print_r($purchases);
        }
    }
    return $purchases;
}
/**
 *
 * This method will post a reply to an item comment on the main marketplace.
 *
 * It needs the authenticity token stored for the main marketplace, so that
 * marketplace has to be authenticated first.
 * NOTE(review): the request is sent without forcing a refresh, so an identical
 * earlier post may be answered from the local cache - confirm this is intended.
 *
 * @param int|string $item_id id of the item the comment belongs to
 * @param int|string $comment_id id of the comment being replied to (sent as parent_id)
 * @param string $message the reply text
 *
 * @return bool|int false when no authenticity token is available, otherwise the
 *         preg_match() result of looking for the "Your reply was added" notice
 */
public function post_comment($item_id, $comment_id, $message){
    $csrf_token = $this->get_authenticity_token();
    if(!$csrf_token){
        return false;
    }
    $endpoint = $this->main_marketplace.'/item/goto/'.$item_id.'/comments';
    $fields = array();
    $fields['utf8'] = '✓';
    $fields['authenticity_token'] = $csrf_token;
    $fields['parent_id'] = $comment_id;
    $fields['ret'] = 'author_dashboard';
    $fields['content'] = $message;
    $response = $this->_get_url($endpoint, $fields, false);
    $success_notice = '#<div class="notice flash">(\s*)<p>Your reply was added<\/p>(\s*)<\/div>#';
    return preg_match($success_notice, $response);
}
/**
 *
 * This method will return the CSV statement for Envato earnings.
 * Useful for manual calculations within your own system.
 * eg: a system that automatically calculates split earnings on collaboration items.
 *
 * One statement CSV is requested per month, from the month in $datefrom up to
 * and including the current month. The current month is always re-downloaded;
 * earlier months may come from the local cache. Responses that are empty or
 * look like an HTML page (not logged in, invalid month, site down) are skipped.
 *
 * Each returned line is an array with the keys type, date, time, item,
 * item_id, envato_item_id, earnt, amount and rate.
 *
 * @param bool|string $datefrom first month to fetch, formatted "month/year" e.g. "3/2013"
 * @param bool|string $dateto currently ignored
 *
 * @return array list of statement lines; empty when $datefrom is not a valid "month/year"
 */
public function get_statement($datefrom,$dateto=false){
    $items = array();
    $current_month = (int)date('n');
    $current_year = (int)date('Y');

    // work out what dates we need to grab from the statement.
    // Anything that is not "month/year" is rejected up front: an empty year
    // would otherwise make the loop below start at year 0.
    $date_parts = explode('/',(string)$datefrom);
    if(count($date_parts) < 2 || !is_numeric(trim($date_parts[0])) || !is_numeric(trim($date_parts[1]))){
        return $items;
    }
    // cast to int so a zero padded month ("03") produces the same url form as every later month.
    $from_month = (int)trim($date_parts[0]);
    $from_year = (int)trim($date_parts[1]);
    if($from_month < 1 || $from_month > 12){
        return $items;
    }

    $statement_url_requests = array();
    $xy = $from_year;
    $xm = $from_month;
    while($xy < $current_year || ($xy == $current_year && $xm <= $current_month)){
        $statement_url_requests[] = $this->main_marketplace . "/statement/".$xy.'-'.$xm.'.csv';
        $xm++;
        if($xm > 12){
            $xm = 1;
            $xy++;
        }
    }
    $current_statement_url = $this->main_marketplace . "/statement/".$current_year.'-'.$current_month.'.csv';
    if(_ENVATO_DEBUG_MODE){
        echo 'grabbing these statement urls:';
        print_r($statement_url_requests);
    }

    // save as temp file and use fgetcsv
    // dont want to use str_getcsv because it requires 5.3 and some people are still on 5.2.
    $temp_csv_file = _ENVATO_TMP_DIR.'envato_'.basename($this->username)."_statement-csv-current.csv";
    foreach($statement_url_requests as $url){
        if($url === $current_statement_url){
            // we always grab a new copy of the latest months statement:
            // any previous months we always use the cached version if they exist.
            $data = $this->_get_url($url,array(),true);
        }else{
            // fall back to cache.
            $data = $this->_get_url($url,array(),false);
            if(preg_match('#<html#',$data) && $this->_got_url_from_cache){
                // we got a cached html file, try again without cache mode just for kicks.
                $data = $this->_get_url($url,array(),true);
            }
        }
        if(!is_string($data) || $data === '' || preg_match('#<html#',$data)){
            // probably not logged in correctly, invalid month or envato is temporarily down.
            // An html page is not a statement, so do not feed it to the csv parser.
            if(_ENVATO_DEBUG_MODE){
                echo "Skipping $url: response is not a CSV statement<br>";
            }
            continue;
        }
        if(file_put_contents($temp_csv_file,$data) === false){
            continue;
        }
        $fd = fopen($temp_csv_file,"r");
        if(!$fd){
            continue;
        }
        $count = 1;
        // length 0 = no line length limit, so long item names are not split across rows.
        while (($row = fgetcsv($fd, 0, ",")) !== FALSE) {
            if(1 == $count){
                $count++;
                continue;
            } // dont save header.
            if(count($row)<2)continue;
            $items[]=$row;
            $count++;
        }
        if(_ENVATO_DEBUG_MODE){
            echo "Month: $temp_csv_file got $count items<br>";
        }
        fclose($fd);
    }

    foreach($items as $index => $raw_row){
        // short rows are padded so every column read below exists.
        $row = array_pad($raw_row, 7, '');
        $item_type = str_replace('"','',trim($row[1]));
        $item_name = str_replace('"','',$row[2]);
        $item_id = (int)str_replace('"','',$row[3]);
        $item_amount = 0;
        $item_rate = str_replace('%','',$row[5]);
        $earnt = str_replace('"','',$row[4]);
        if($item_type == 'sale'){
            // support the old method of logging stuff:
            // there the description column holds name, price and rate in one sentence.
            if(preg_match('/sold (.*) for (\d.*) w\/ rate of (.*)%/U',str_replace('"','',$row[3]),$matches)){
                $item_name = $matches[1];
                $item_amount = $matches[2];
                $item_rate = $matches[3];
                $earnt = str_replace('"','',$row[2]);
            }else{
                $item_name = $row[2];
                $item_id = (int)$row[3];
                $item_amount = $row[6];
            }
        }
        $items[$index] = array(
            "type" => $item_type,
            "date" => trim($row[0]),
            "time" => strtotime(trim($row[0])),
            "item" => $item_name,
            "item_id" => $item_id,
            'envato_item_id'=>0, // database id.
            "earnt" => $earnt,
            "amount" => $item_amount,
            "rate" => $item_rate,
        );
    }
    if(_ENVATO_DEBUG_MODE){
        echo "There are ".count($items)." lines in your statement CSV file. Is this correct? <br>";
    }
    return $items;
}
/**
 *
 * This method will return the reviews for your Envato items.
 *
 * It walks the /reviews pages of the main marketplace, starting at page 1, and
 * stops at the first page that is empty, reports "Page Not Found", contains no
 * parsable review or repeats the previous page. Requires a prior successful login.
 * NOTE(review): pages are requested without forcing a refresh, so results may
 * come from the local cache - confirm this is intended.
 *
 * Each review has the keys rating_id, rating_url, buyer, stars, review,
 * item_id, item_name, item_url, date and date_estimate.
 *
 * todo: filters for $datefrom, $type and $item_id are not implemented yet.
 *
 * @return array list of reviews, empty when not logged in
 */
public function get_reviews(){ // todo: $datefrom=false,$type=1,$item_id=false
    if(!$this->logged_in)return array();
    $reviews = array();
    if(_ENVATO_DEBUG_MODE){
        echo 'grabbing reviews...';
    }
    $page_number = 1;
    $previous_first_rating = null;
    while(true){
        $data = $this->_get_url( $this->main_marketplace . "/reviews?page=".$page_number);
        if(!$data || strpos($data,'Page Not Found') !== false)break;
        $data = preg_replace('#\s+#',' ',$data);
        if(!preg_match_all('#id="review_\d+".*<div class="review__details"> (.*) on <a href="(/item/[^/]+/)reviews/(\d+)"[^>]+>([^<]+)</a>.*<a href="/ratings/(\d+)"[^>]+>([^<]+)</a>.*</div> </div> <div class="(review|page-controls)"#imsU',$data,$matches)){
            // Nothing to parse on this page: we are past the last page (or the
            // markup changed / we got a login page). Stop instead of requesting
            // page after page forever.
            if(_ENVATO_DEBUG_MODE){
                echo 'no matches';
            }
            break;
        }
        // Guard against a site that keeps serving the last page for any higher page number.
        if($previous_first_rating !== null && $previous_first_rating === $matches[5][0]){
            break;
        }
        $previous_first_rating = $matches[5][0];
        foreach($matches[0] as $match_id => $match){
            $this_review = array(
                'rating_id' => $matches[5][$match_id],
                'rating_url' => '/ratings/'.$matches[5][$match_id],
                'buyer' => strip_tags($matches[1][$match_id]),
                'stars' => '',
                'review' => '',
                'item_id' => $matches[3][$match_id],
                'item_name' => $matches[4][$match_id],
                'item_url' => $matches[2][$match_id].$matches[3][$match_id],
                'date' => $matches[6][$match_id],
                // the page only shows a relative age ("3 days ago"), so this is an approximation.
                'date_estimate' => date('Y-m-d',strtotime('-'.str_replace(' ago','',$matches[6][$match_id]),time())),
            );
            if(preg_match_all('#alt="Star-on" class="rating-basic#',$match,$stars)){
                $this_review['stars'] = count($stars[0]);
            }
            if(preg_match('#<div class="review__comments"> <p> <strong>Extra comments from the buyer:</strong><br />(.*)</p>#imsU',$match,$comments)){
                $this_review['review'] = trim($comments[1]);
            }
            $reviews[] = $this_review;
        }
        $page_number++;
    }
    return $reviews;
}
// Set by _get_url(): true when the last response was served from the cache,
// false when it was fetched over the network.
private $_got_url_from_cache = false;
/**
 * This method handles all the remote URL gets, and caching.
 *
 * Cache entries are keyed on the url plus the post data and never expire; pass
 * $force to bypass them. Cookies are kept in a per-username jar inside
 * _ENVATO_TMP_DIR so the session survives between requests.
 *
 * @param string $url Url to get: eg http://themeforest.net/user/dtbaker
 * @param array $post Any post data to send (eg: login details)
 * @param bool $force Force it to refresh, aka: dont read from cache.
 * @return string|bool HTML data that came back from request, false when the request failed.
 */
public function _get_url($url,$post=array(),$force=false){
    $cache_key = md5(_ENVATO_SECRET . $url . serialize($post));
    $data = ($force) ? false : $this->_get_cache($cache_key);
    if($data){
        $this->_got_url_from_cache=true;
        return $data;
    }
    $this->_got_url_from_cache=false;
    $ch=curl_init();
    if(!$ch){
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, _ENVATO_DEBUG_MODE); // debug
    curl_setopt($ch, CURLINFO_HEADER_OUT, _ENVATO_DEBUG_MODE); // debug
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 3);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    $cookies = _ENVATO_TMP_DIR.'cookie-'.md5(_ENVATO_SECRET.$this->username.__FILE__);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookies);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookies);
    // NOTE(review): certificate verification is switched off although login
    // credentials are sent through this method. Enable it where a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST,0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_USERAGENT, "EnvatoScraper/1.0 (compatible;)");
    curl_setopt($ch, CURLOPT_VERBOSE, 0);
    if($post){
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
    }
    $data = curl_exec($ch);
    if(_ENVATO_DEBUG_MODE){
        $headers = curl_getinfo($ch, CURLINFO_HEADER_OUT);
        echo '<hr>headers for url '.$url.'<br>';var_dump($headers);echo '<hr>';
        file_put_contents(_ENVATO_TMP_DIR."envato_request-".preg_replace('#[^a-z]#','',$url).".html",$data);
        if(preg_match('#Not Allowed#',$data)){
            echo "Failed with nginx not allowed on request $url with post data:<br>"; print_r($post);
        }
    }
    // release the handle; this also flushes the cookie jar to disk.
    curl_close($ch);
    if($data !== false){
        // failed transfers are not worth a cache file, they are never read back as a hit.
        $this->_save_cache($cache_key,$data);
    }
    return $data;
}
/**
 * caching so we don't hit envato too much
 *
 * Reads one cache entry back from _ENVATO_TMP_DIR.
 *
 * @param string $key cache key as produced in _get_url()
 * @return mixed the unserialized value, or false when there is no such entry
 *         (or it cannot be unserialized)
 */
private function _get_cache($key){
    $cache_file = _ENVATO_TMP_DIR.'cache-'.basename($key);
    if(!is_file($cache_file)){
        return false;
    }
    $serialized = file_get_contents($cache_file);
    return @unserialize($serialized);
}
/**
 * Store one value in the file cache inside _ENVATO_TMP_DIR.
 *
 * @param string $key cache key as produced in _get_url()
 * @param mixed $data value to store (serialized before writing)
 * @return bool always true, the write result is not checked
 */
private function _save_cache($key,$data){
    $cache_file = _ENVATO_TMP_DIR.'cache-'.basename($key);
    $payload = serialize($data);
    file_put_contents($cache_file,$payload);
    return true;
}
/**
 * Put everything on one line for easier regex scraping: line breaks are
 * removed first, then every remaining run of whitespace becomes one space.
 *
 * @param string $data raw page html
 * @return string
 */
private function _clean($data){
    // patterns are applied in order, each on the result of the previous one.
    $patterns = array("/\r|\n/", "/\s+/");
    $replacements = array("", " ");
    return preg_replace($patterns, $replacements, $data);
}
/**
 *
 * Look up the authenticity token that was stored for a marketplace when it
 * was authenticated.
 *
 * @param string $marketplace marketplace base url; defaults to the main marketplace when empty
 *
 * @return string|bool the token, or false when none is stored for that marketplace
 */
private function get_authenticity_token($marketplace = ''){
    if(!$marketplace){
        $marketplace = $this->main_marketplace;
    }
    // tokens are stored under the short tag: url without scheme and tld.
    $tag = str_replace(array('http://', '.net'), '', $marketplace);
    if(isset($this->authenticity_tokens[$tag])){
        return $this->authenticity_tokens[$tag];
    }
    return false;
}
}