diff --git a/composer.json b/composer.json index 5780083..ed1d717 100644 --- a/composer.json +++ b/composer.json @@ -26,13 +26,18 @@ "symfony/process": "^2.8", "symfony/dom-crawler": "^2.8", "symfony/console": "^2.8", - "doctrine/collections": "^1.3" + "doctrine/collections": "^1.3", + "monolog/monolog": "^1.22" + }, + "suggest": { + "ext-zip":"For unzipping tor zip on windows." }, "config": { "platform": { "php": "5.5" } }, + "require-dev": { "phpunit/phpunit": "^4.8", "friendsofphp/php-cs-fixer": "^2.0", diff --git a/composer.lock b/composer.lock index b2250c7..5607f12 100644 --- a/composer.lock +++ b/composer.lock @@ -4,8 +4,8 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "This file is @generated automatically" ], - "hash": "9ec1203744073948a67a86ddfbf1d1e2", - "content-hash": "83d61f647d294058c67c53ea5fab1dc9", + "hash": "7c842d90a1889fdc4f1a20870e228885", + "content-hash": "acbb85e6793692dab952a6d42f6f1618", "packages": [ { "name": "doctrine/collections", @@ -289,6 +289,84 @@ ], "time": "2016-03-26 03:44:56" }, + { + "name": "monolog/monolog", + "version": "1.22.0", + "source": { + "type": "git", + "url": "https://github.com/Seldaek/monolog.git", + "reference": "bad29cb8d18ab0315e6c477751418a82c850d558" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/bad29cb8d18ab0315e6c477751418a82c850d558", + "reference": "bad29cb8d18ab0315e6c477751418a82c850d558", + "shasum": "" + }, + "require": { + "php": ">=5.3.0", + "psr/log": "~1.0" + }, + "provide": { + "psr/log-implementation": "1.0.0" + }, + "require-dev": { + "aws/aws-sdk-php": "^2.4.9 || ^3.0", + "doctrine/couchdb": "~1.0@dev", + "graylog2/gelf-php": "~1.0", + "jakub-onderka/php-parallel-lint": "0.9", + "php-amqplib/php-amqplib": "~2.4", + "php-console/php-console": "^3.1.3", + "phpunit/phpunit": "~4.5", + "phpunit/phpunit-mock-objects": "2.3.0", + "ruflin/elastica": ">=0.90 <3.0", + "sentry/sentry": "^0.13", + "swiftmailer/swiftmailer": "~5.3" + }, + "suggest": { + "aws/aws-sdk-php": "Allow sending log messages to AWS services like DynamoDB", + "doctrine/couchdb": "Allow sending log messages to a CouchDB server", + "ext-amqp": "Allow sending log messages to an AMQP server (1.0+ required)", + "ext-mongo": "Allow sending log messages to a MongoDB server", + "graylog2/gelf-php": "Allow sending log messages to a GrayLog2 server", + "mongodb/mongodb": "Allow sending log messages to a MongoDB server via PHP Driver", + "php-amqplib/php-amqplib": "Allow sending log messages to an AMQP server using php-amqplib", + "php-console/php-console": "Allow sending log messages to Google Chrome", + "rollbar/rollbar": "Allow sending log messages to Rollbar", + "ruflin/elastica": "Allow sending log messages to an Elastic Search server", + "sentry/sentry": "Allow sending log messages to a Sentry server" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Monolog\\": "src/Monolog" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Jordi Boggiano", + "email": "j.boggiano@seld.be", + "homepage": "http://seld.be" + } + ], + "description": "Sends your logs to files, sockets, inboxes, databases and various web services", + "homepage": "http://github.com/Seldaek/monolog", + "keywords": [ + "log", + "logging", + "psr-3" + ], + "time": "2016-11-26 00:15:39" + }, { "name": "psr/http-message", "version": "1.0.1", diff --git a/example.php b/example.php new file mode 100644 index 0000000..520c7b6 --- /dev/null +++ b/example.php @@ -0,0 +1,25 @@ +get('http://32b5oz2bbtn6gqj3.onion/index.php/Main_Page'); +$client->get('http://4sy6ebszykvcv2n6.onion/'); + +//$client->get('https://de.indymedia.org/index.shtml'); + +$images = $client->images(); +//dump($images); +foreach($images as $image) { + + $result[] = $image->toArray(); +} +dump($result); +dump(json_encode(serialize($result))); +file_put_contents('data.json', json_encode($result, JSON_PRETTY_PRINT)); diff --git a/src/Sikofitt/Tor/Collection/ImageCollection.php b/src/Sikofitt/Tor/Collection/ImageCollection.php index 0b1979d..1864c95 100644 --- a/src/Sikofitt/Tor/Collection/ImageCollection.php +++ b/src/Sikofitt/Tor/Collection/ImageCollection.php @@ -9,28 +9,145 @@ namespace Sikofitt\Tor\Collection; use Doctrine\Common\Collections\ArrayCollection; +use GuzzleHttp\ClientInterface; +use GuzzleHttp\Psr7\Uri; +use Monolog\Handler\StreamHandler; +use Monolog\Logger; +use Sikofitt\Tor\TorClient; use Symfony\Component\DomCrawler\Crawler; class ImageCollection { + /** + * @var Uri + * The Uri we are collecting images from + */ + private $uri; + /** + * @var string + * The html we are parsing + */ private $html; + + /** + * @var ArrayCollection + * A container for the found images. + */ private $images; + + /** + * @var Crawler + * Our dom crawler + */ private $crawler; - public function __construct($html) + /** + * @var ClientInterface + * Our http client + */ + private $client; + + /** + * @var Logger + * Our logger + */ + private $logger; + + /** + * @var int + * The timeout between image requests. + */ + private $timeout; + + /** + * ImageCollection constructor. + * @param $uri + * @param $html + * @param ClientInterface $client + * @param int $timeout + */ + public function __construct($uri, $html, ClientInterface $client, $timeout = 10) { + $this->timeout = $timeout; + $this->logger = new Logger('tor-spider-ImageCollection'); + $this->logger->pushHandler(new StreamHandler('php://stdout', Logger::DEBUG)); + $this->client = $client; $this->html = $html; + $this->uri = new Uri($uri); $this->images = new ArrayCollection(); $this->crawler = new Crawler(); + + } + private function sleep() { + $this->logger->debug('sleeping for ' . $this->timeout . ' seconds.'); + for($i=$this->timeout;$i>=1;$i--) + { + + if(true === in_array($i, array(ceil($this->timeout/2), 2, 1))) { + $this->logger->debug($i . ' Seconds ...'); + } + sleep(1); + } + $this->logger->debug('Continuing ...'); } + /** + * @return ArrayCollection + */ public function images() { + $this->crawler->addHtmlContent($this->html); $images = $this->crawler->filter('img'); + $sources = []; foreach($images as $image) { - $this->images->add(new ArrayCollection($image->attributes)); + if(in_array($image->getAttribute('src'), array_values($sources))) + { + $this->logger->debug('Aready have source skipping ...', array('url' => $image->getAttribute('src'))); + continue; + } + $sources[] = $image->getAttribute('src'); + + $alt = $image->getAttribute('alt'); + if(0 === strlen($alt)) { + $this->logger->debug('Setting alt to image src', array('src' => $image->getAttribute('src'))); + $alt = $image->getAttribute('src'); + } else { + $this->logger->debug('Found alt ... was ' . $alt); + } + + $uri = new Uri($image->getAttribute('src')); + + if(0 === strlen($uri->getScheme())) { + $uri = $uri->withScheme($this->uri->getScheme()); + } + if(0 === strlen($uri->getHost())) { + $uri = $uri->withHost($this->uri->getHost()); + } + try { + $request = $this->client->get(sprintf('%s://%s/%s', + $uri->getScheme(), $uri->getHost(), + ltrim($uri->getPath(), '/'))); + if(404 === $request->getStatusCode()) { + $this->logger->debug('Got 404 skipping ... ', array('url' => sprintf('%s://%s/%s', + $uri->getScheme(), $uri->getHost(), + ltrim($uri->getPath(), '/')))); + continue; + } + $raw = $request->getBody()->getContents(); + } catch(\Exception $e) { + print $e->getMessage(); + $this->logger->error($e->getMessage(), $e->getTrace()); + continue; + } + $result['alt'] = $alt; + $result['fqd'] = sprintf('%s://%s/%s', $uri->getScheme(), $uri->getHost(), ltrim($uri->getPath(), '/')); + $result['src'] = $image->getAttribute('src'); + $result['raw'] = $raw; + $result['base64'] = base64_encode($raw); + $this->images->set($alt, $result); + $this->sleep(); } return $this->images; } diff --git a/src/Sikofitt/Tor/TorClient.php b/src/Sikofitt/Tor/TorClient.php index 4ce2e18..1ba05cc 100644 --- a/src/Sikofitt/Tor/TorClient.php +++ b/src/Sikofitt/Tor/TorClient.php @@ -9,11 +9,16 @@ namespace Sikofitt\Tor; +use Doctrine\Common\Collections\ArrayCollection; use GuzzleHttp\Client; use GuzzleHttp\ClientInterface; use GuzzleHttp\Handler\CurlMultiHandler; use GuzzleHttp\HandlerStack; +use GuzzleHttp\Psr7\Stream; use GuzzleTor\Middleware; +use Monolog\Handler\StreamHandler; +use Monolog\Handler\SyslogHandler; +use Monolog\Logger; use Sikofitt\Tor\Collection\ImageCollection; use Sikofitt\Tor\Exception\BadProxyUrlException; @@ -22,7 +27,7 @@ use Sikofitt\Tor\Exception\BadProxyUrlException; * * @package Sikofitt\Tor */ -class TorClient extends Client implements ClientInterface +class TorClient { /** @@ -49,19 +54,27 @@ class TorClient extends Client implements ClientInterface * @var Middleware */ private $middleware; - private $poolData; + private $htmlData; + private $images; + private $logger; + public function __construct( $proxy = '127.0.0.1:9050', $torControl = '127.0.0.1:9051' ) { + $this->logger = new Logger('tor-spider'); + $this->logger->pushHandler(new StreamHandler('php://stdout', Logger::DEBUG)); + $this->logger->pushHandler(new SyslogHandler('tor-spider'), Logger::DEBUG); $this->proxy = $proxy; $this->torControl = $torControl; + $this->htmlData = new ArrayCollection(); + $this->logger->debug('Setting up tor handler'); $this->setTorMiddleWare(); $this->createHandlerStack(); $this->setClient(); - + $this->logger->debug('Finished'); } /** @@ -69,7 +82,9 @@ class TorClient extends Client implements ClientInterface */ public function setTorMiddleWare() { + $this->logger->debug('Setting up tor middleware.'); $this->middleware = Middleware::tor($this->proxy, $this->torControl); + $this->logger->debug('Finished'); return $this; } @@ -96,12 +111,33 @@ class TorClient extends Client implements ClientInterface $this->client = new Client([ 'verify' => false, 'handler' => $this->handlerStack, + 'allow_redirects' => true, + 'max_redirects' => 20, ]); - return $this; } + + public function get($uris, array $options = []) + { + if(is_array($uris) || is_object($uris)) + { + $this->htmlData = $this->pool($uris); + } else { + try { + $this->logger->debug('Requesting ' . $uris . ' ...'); + $response = $this->client->get($uris, $options); + $this->htmlData->set($uris, + $response->getBody()->getContents()); + $this->logger->debug('html retrieved', array('uri' => $uris)); + } catch(\Exception $e) { + $this->logger->error($e->getMessage(), $e->getTrace()); + return null; + } + } + + } public function setImages($images) { $this->images = $images; @@ -200,10 +236,16 @@ class TorClient extends Client implements ClientInterface return $this; } - public function images($html) + public function images() { - $images = new ImageCollection($html); - return $this; + + foreach($this->htmlData as $uri => $html) + { $image = new ImageCollection($uri, $html, $this->client, 5); + + $this->images[] = $image->images(); + } + + return $this->images; } } \ No newline at end of file diff --git a/tor-client/bin/tor b/tor-client/bin/tor index 639f523..c4fd068 100755 Binary files a/tor-client/bin/tor and b/tor-client/bin/tor differ diff --git a/tor-client/bin/tor-gencert b/tor-client/bin/tor-gencert index cbfb616..96df1e7 100755 Binary files a/tor-client/bin/tor-gencert and b/tor-client/bin/tor-gencert differ diff --git a/tor-client/bin/tor-resolve b/tor-client/bin/tor-resolve index 7131b86..5e2f084 100755 Binary files a/tor-client/bin/tor-resolve and b/tor-client/bin/tor-resolve differ diff --git a/tor-client/etc/tor/torrc.sample b/tor-client/etc/tor/torrc.sample index c2f460e..d853e03 100644 --- a/tor-client/etc/tor/torrc.sample +++ b/tor-client/etc/tor/torrc.sample @@ -34,10 +34,10 @@ ## We advise using "notice" in most cases, since anything more verbose ## may provide sensitive information to an attacker who obtains the logs. ## -## Send all messages of level 'notice' or higher to /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/log/tor/notices.log -#Log notice file /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/log/tor/notices.log -## Send every possible message to /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/log/tor/debug.log -#Log debug file /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/log/tor/debug.log +## Send all messages of level 'notice' or higher to /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/log/tor/notices.log +#Log notice file /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/log/tor/notices.log +## Send every possible message to /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/log/tor/debug.log +#Log debug file /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/log/tor/debug.log ## Use the system log instead of Tor's logfiles #Log notice syslog ## To send all messages to stderr: @@ -50,7 +50,7 @@ ## The directory for keeping all the keys/etc. By default, we store ## things in $HOME/.tor on Unix, and in Application Data\tor on Windows. -#DataDirectory /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor +#DataDirectory /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor ## The port on which Tor will listen for local connections from Tor ## controller applications, as documented in control-spec.txt. @@ -69,10 +69,10 @@ ## HiddenServicePort x y:z says to redirect requests on port x to the ## address y:z. -#HiddenServiceDir /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor/hidden_service/ +#HiddenServiceDir /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor/hidden_service/ #HiddenServicePort 80 127.0.0.1:80 -#HiddenServiceDir /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor/other_hidden_service/ +#HiddenServiceDir /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor/other_hidden_service/ #HiddenServicePort 80 127.0.0.1:80 #HiddenServicePort 22 127.0.0.1:22 @@ -147,7 +147,7 @@ ## can explain what Tor is if anybody wonders why your IP address is ## contacting them. See contrib/tor-exit-notice.html in Tor's source ## distribution for a sample. -#DirPortFrontPage /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/tor-exit-notice.html +#DirPortFrontPage /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/tor-exit-notice.html ## Uncomment this if you run more than one Tor relay, and add the identity ## key fingerprint of each Tor relay you control, even if they're on diff --git a/tor-client/share/doc/tor/tor.html b/tor-client/share/doc/tor/tor.html index 2f0506d..666b862 100644 --- a/tor-client/share/doc/tor/tor.html +++ b/tor-client/share/doc/tor/tor.html @@ -786,7 +786,7 @@ Project’s website.

Specify a new configuration file to contain further Tor configuration options OR pass - to make Tor read its configuration from standard - input. (Default: /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc, or $HOME/.torrc if that file is not + input. (Default: /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc, or $HOME/.torrc if that file is not found)

@@ -807,7 +807,7 @@ Project’s website.

Specify a file in which to find default values for Tor options. The contents of this file are overridden by those in the regular configuration file, and by those on the command line. (Default: - /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc-defaults.) + /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc-defaults.)

@@ -1374,7 +1374,7 @@ forward slash (/) in the configuration file and on the command line.

- Store working data in DIR (Default: /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor) + Store working data in DIR (Default: /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor)

@@ -5248,7 +5248,7 @@ TestingEnableTbEmptyEvent 1
-/home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc +/home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc

@@ -5260,11 +5260,11 @@ TestingEnableTbEmptyEvent 1

- Fallback location for torrc, if /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc is not found. + Fallback location for torrc, if /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc is not found.

-/home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor/ +/home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor/

diff --git a/tor-client/share/man/man1/tor.1 b/tor-client/share/man/man1/tor.1 index da52554..efd2cec 100644 --- a/tor-client/share/man/man1/tor.1 +++ b/tor-client/share/man/man1/tor.1 @@ -50,7 +50,7 @@ Display a short help message and exit\&. .RS 4 Specify a new configuration file to contain further Tor configuration options OR pass \fB\-\fR -to make Tor read its configuration from standard input\&. (Default: /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc, or $HOME/\&.torrc if that file is not found) +to make Tor read its configuration from standard input\&. (Default: /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc, or $HOME/\&.torrc if that file is not found) .RE .PP \fB\-\-allow\-missing\-torrc\fR @@ -62,7 +62,7 @@ exist if default torrc can be accessed\&. .PP \fB\-\-defaults\-torrc\fR \fIFILE\fR .RS 4 -Specify a file in which to find default values for Tor options\&. The contents of this file are overridden by those in the regular configuration file, and by those on the command line\&. (Default: /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc\-defaults\&.) +Specify a file in which to find default values for Tor options\&. The contents of this file are overridden by those in the regular configuration file, and by those on the command line\&. (Default: /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc\-defaults\&.) .RE .PP \fB\-\-ignore\-missing\-torrc\fR @@ -373,7 +373,7 @@ If this option is set to 0, don\(cqt allow the filesystem group to read the cont .PP \fBDataDirectory\fR \fIDIR\fR .RS 4 -Store working data in DIR (Default: /home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor) +Store working data in DIR (Default: /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor) .RE .PP \fBDataDirectoryGroupReadable\fR \fB0\fR|\fB1\fR @@ -2547,17 +2547,17 @@ If this signal exists on your platform, Tor catches and ignores it\&. .RE .SH "FILES" .PP -\fB/home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc\fR +\fB/home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc\fR .RS 4 The configuration file, which contains "option value" pairs\&. .RE .PP \fB$HOME/\&.torrc\fR .RS 4 -Fallback location for torrc, if /home/eric/projects/tor/tor-0.2.9.8/../tor-client/etc/tor/torrc is not found\&. +Fallback location for torrc, if /home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/etc/tor/torrc is not found\&. .RE .PP -\fB/home/eric/projects/tor/tor-0.2.9.8/../tor-client/var/lib/tor/\fR +\fB/home/eric/projects/php/tor-spider/tor-0.2.9.8/../tor-client/var/lib/tor/\fR .RS 4 The tor process stores keys and other data here\&. .RE