2012-05-28 22:27:18 +04:00
|
|
|
<?php
|
|
|
|
/**
|
2016-07-23 22:32:42 +03:00
|
|
|
* Nextcloud - News
|
2014-04-19 20:16:55 +04:00
|
|
|
*
|
|
|
|
* This file is licensed under the Affero General Public License version 3 or
|
|
|
|
* later. See the COPYING file.
|
|
|
|
*
|
|
|
|
* @author Alessandro Cosentino <cosenal@gmail.com>
|
|
|
|
* @author Bernhard Posselt <dev@bernhard-posselt.com>
|
|
|
|
* @copyright Alessandro Cosentino 2012
|
|
|
|
* @copyright Bernhard Posselt 2012, 2014
|
|
|
|
*/
|
2012-05-28 22:27:18 +04:00
|
|
|
|
2013-09-27 22:03:00 +04:00
|
|
|
namespace OCA\News\Fetcher;
|
2012-08-11 20:19:37 +04:00
|
|
|
|
2016-03-26 22:18:19 +03:00
|
|
|
use Exception;
|
|
|
|
|
2017-01-04 13:10:19 +03:00
|
|
|
use OCA\News\PostProcessor\LWNProcessor;
|
|
|
|
use OCP\Http\Client\IClientService;
|
2015-08-10 21:20:30 +03:00
|
|
|
use PicoFeed\Parser\MalFormedXmlException;
|
|
|
|
use PicoFeed\Reader\Reader;
|
2015-08-26 14:03:55 +03:00
|
|
|
use PicoFeed\Parser\Parser;
|
2015-08-10 21:20:30 +03:00
|
|
|
use PicoFeed\Reader\SubscriptionNotFoundException;
|
|
|
|
use PicoFeed\Reader\UnsupportedFeedFormatException;
|
|
|
|
use PicoFeed\Client\InvalidCertificateException;
|
|
|
|
use PicoFeed\Client\InvalidUrlException;
|
|
|
|
use PicoFeed\Client\MaxRedirectException;
|
|
|
|
use PicoFeed\Client\MaxSizeException;
|
|
|
|
use PicoFeed\Client\TimeoutException;
|
2016-03-25 17:31:17 +03:00
|
|
|
use PicoFeed\Client\ForbiddenException;
|
|
|
|
use PicoFeed\Client\UnauthorizedException;
|
2015-08-10 21:20:30 +03:00
|
|
|
|
|
|
|
use OCP\IL10N;
|
|
|
|
|
|
|
|
use OCA\News\Db\Item;
|
|
|
|
use OCA\News\Db\Feed;
|
|
|
|
use OCA\News\Utility\PicoFeedFaviconFactory;
|
|
|
|
use OCA\News\Utility\PicoFeedReaderFactory;
|
2016-07-23 19:34:17 +03:00
|
|
|
use OCA\News\Utility\Time;
|
2012-07-03 07:39:19 +04:00
|
|
|
|
2013-04-02 13:09:33 +04:00
|
|
|
/**
 * Generic feed fetcher: downloads a feed through the PicoFeed reader and
 * converts the parsed result into News Feed and Item entities.
 */
class FeedFetcher implements IFeedFetcher {

    /** @var PicoFeedFaviconFactory builds the favicon finder used by buildFeed() */
    private $faviconFactory;

    /** @var Reader PicoFeed reader used to discover and parse feeds */
    private $reader;

    /** @var IL10N translator for the error messages raised by this class */
    private $l10n;

    /** @var Time clock used to stamp the "added" time of a built feed */
    private $time;

    /** @var IClientService handed to the LWN post processor for full text */
    private $clientService;

    /**
     * @param Reader $reader PicoFeed reader
     * @param PicoFeedFaviconFactory $faviconFactory factory for favicon finders
     * @param IL10N $l10n translator for error messages
     * @param Time $time clock abstraction
     * @param IClientService $clientService passed on to the LWN processor
     */
    public function __construct(Reader $reader,
                                PicoFeedFaviconFactory $faviconFactory,
                                IL10N $l10n,
                                Time $time,
                                IClientService $clientService) {
        $this->faviconFactory = $faviconFactory;
        $this->reader = $reader;
        $this->time = $time;
        $this->l10n = $l10n;
        $this->clientService = $clientService;
    }

    /**
     * This fetcher handles all the remaining urls therefore always returns
     * true
     *
     * @param string $url the url that should be fetched (not inspected)
     * @return bool always true
     */
    public function canHandle($url) {
        return true;
    }

    /**
     * Fetch a feed from remote
     *
     * @param string $url remote url of the feed
     * @param boolean $getFavicon if the favicon should also be fetched,
     * defaults to true
     * @param string $lastModified a last modified value from an http header,
     * defaults to null. If lastModified matches the http header from the feed
     * no results are fetched
     * @param string $etag an etag from an http header, defaults to null.
     * If the etag matches the http header from the feed no results are
     * fetched
     * @param bool $fullTextEnabled if true tells the fetcher to enhance the
     * articles by fetching custom enhanced content
     * @param string $basicAuthUser if given, basic auth is set for this feed
     * @param string $basicAuthPassword if given, basic auth is set for this
     * feed. Ignored if user is null or an empty string
     * @throws FetcherException if it fails
     * @return array an array containing the new feed and its items, first
     * element being the Feed and second element being an array of Items.
     * Both elements are null if the remote resource was not modified
     */
    public function fetch($url, $getFavicon = true, $lastModified = null,
                          $etag = null, $fullTextEnabled = false,
                          $basicAuthUser = null, $basicAuthPassword = null) {
        try {
            // credentials are only forwarded when a non blank user is given
            if ($basicAuthUser !== null && trim($basicAuthUser) !== '') {
                $resource = $this->reader->discover(
                    $url, $lastModified, $etag,
                    $basicAuthUser, $basicAuthPassword
                );
            } else {
                $resource = $this->reader->discover(
                    $url, $lastModified, $etag
                );
            }

            if (!$resource->isModified()) {
                return [null, null];
            }

            $location = $resource->getUrl();
            $etag = $resource->getEtag();
            $content = $resource->getContent();
            $encoding = $resource->getEncoding();
            $lastModified = $resource->getLastModified();

            $parser = $this->reader->getParser($location, $content, $encoding);

            if ($fullTextEnabled) {
                $parser->enableContentGrabber();
                $parser->getItemPostProcessor()->register(
                    new LWNProcessor(
                        $basicAuthUser, $basicAuthPassword,
                        $this->clientService
                    )
                );
            }

            $parsedFeed = $parser->execute();

            $feed = $this->buildFeed(
                $parsedFeed, $url, $getFavicon, $lastModified, $etag, $location
            );

            $items = [];
            foreach ($parsedFeed->getItems() as $parsedItem) {
                $items[] = $this->buildItem($parsedItem, $parsedFeed);
            }

            return [$feed, $items];

        } catch (Exception $ex) {
            // always throws a FetcherException, so nothing is returned here
            $this->handleError($ex, $url);
        }
    }

    /**
     * Translates an exception raised while fetching into a FetcherException
     * with a localized message. Exceptions that are not known keep their
     * original message.
     *
     * @param Exception $ex the exception that was caught while fetching
     * @param string $url the url that was requested, shown in the
     * "feed not found" hint
     * @throws FetcherException always
     */
    private function handleError(Exception $ex, $url) {
        $msg = $ex->getMessage();

        if ($ex instanceof MalFormedXmlException) {
            $msg = $this->l10n->t('Feed contains invalid XML');
        } elseif ($ex instanceof SubscriptionNotFoundException) {
            // The url is handed over as a parameter instead of being glued
            // onto the text: this keeps the translation key constant and
            // prevents percent signs inside the url (e.g. %20) from being
            // treated as placeholders of the message
            $msg = $this->l10n->t(
                'Feed not found: Either the website ' .
                'does not provide a feed or blocks access. To rule out ' .
                'blocking, try to download the feed on your server\'s ' .
                'command line using curl: curl %s',
                [$url]
            );
        } elseif ($ex instanceof UnsupportedFeedFormatException) {
            $msg = $this->l10n->t('Detected feed format is not supported');
        } elseif ($ex instanceof InvalidCertificateException) {
            $msg = $this->buildCurlSslErrorMessage($ex->getCode());
        } elseif ($ex instanceof InvalidUrlException) {
            $msg = $this->l10n->t('Website not found');
        } elseif ($ex instanceof MaxRedirectException) {
            $msg = $this->l10n->t('More redirects than allowed, aborting');
        } elseif ($ex instanceof MaxSizeException) {
            $msg = $this->l10n->t('Bigger than maximum allowed size');
        } elseif ($ex instanceof TimeoutException) {
            $msg = $this->l10n->t('Request timed out');
        } elseif ($ex instanceof UnauthorizedException) {
            $msg = $this->l10n->t(
                'Required credentials for feed were ' .
                'either missing or incorrect'
            );
        } elseif ($ex instanceof ForbiddenException) {
            $msg = $this->l10n->t('Forbidden to access feed');
        }

        throw new FetcherException($msg);
    }

    /**
     * Maps the code of an InvalidCertificateException to a localized
     * description. The case labels follow the curl error codes named in the
     * inline comments.
     *
     * @param int $errorCode code taken from the exception
     * @return string localized message, a generic one for unlisted codes
     */
    private function buildCurlSslErrorMessage($errorCode) {
        switch ($errorCode) {
            case 35: // CURLE_SSL_CONNECT_ERROR
                return $this->l10n->t(
                    'Certificate error: A problem occurred ' .
                    'somewhere in the SSL/TLS handshake. Could be ' .
                    'certificates (file formats, paths, permissions), ' .
                    'passwords, and others.'
                );
            case 51: // CURLE_PEER_FAILED_VERIFICATION
                return $this->l10n->t(
                    'Certificate error: The remote server\'s SSL ' .
                    'certificate or SSH md5 fingerprint was deemed not OK.'
                );
            case 58: // CURLE_SSL_CERTPROBLEM
                return $this->l10n->t(
                    'Certificate error: Problem with the local client ' .
                    'certificate.'
                );
            case 59: // CURLE_SSL_CIPHER
                return $this->l10n->t(
                    'Certificate error: Couldn\'t use specified cipher.'
                );
            case 60: // CURLE_SSL_CACERT
                return $this->l10n->t(
                    'Certificate error: Peer certificate cannot be ' .
                    'authenticated with known CA certificates.'
                );
            case 64: // CURLE_USE_SSL_FAILED
                return $this->l10n->t(
                    'Certificate error: Requested FTP SSL level failed.'
                );
            case 66: // CURLE_SSL_ENGINE_INITFAILED
                return $this->l10n->t(
                    'Certificate error: Initiating the SSL engine failed.'
                );
            case 77: // CURLE_SSL_CACERT_BADFILE
                return $this->l10n->t(
                    'Certificate error: Problem with reading the SSL CA ' .
                    'cert (path? access rights?)'
                );
            case 83: // CURLE_SSL_ISSUER_ERROR
                return $this->l10n->t(
                    'Certificate error: Issuer check failed'
                );
            default:
                return $this->l10n->t('Unknown SSL certificate error!');
        }
    }

    /**
     * Runs html_entity_decode two times over a string so that doubly
     * escaped entities (e.g. &amp;amp;lt;) end up as plain characters.
     *
     * @param string $string the text to decode
     * @return string the decoded text
     */
    private function decodeTwice($string) {
        $flags = ENT_QUOTES | ENT_HTML5;
        $decodedOnce = html_entity_decode($string, $flags, 'UTF-8');

        return html_entity_decode($decodedOnce, $flags, 'UTF-8');
    }

    /**
     * Decides whether an item is written right to left. The language of the
     * item wins, the language of the feed is only used if the item has none.
     *
     * @param object $parsedItem parsed item providing getLanguage()
     * @param object $parsedFeed parsed feed providing getLanguage()
     * @return mixed the result of Parser::isLanguageRTL()
     */
    protected function determineRtl($parsedItem, $parsedFeed) {
        $itemLang = $parsedItem->getLanguage();
        $feedLang = $parsedFeed->getLanguage();

        return Parser::isLanguageRTL($itemLang ? $itemLang : $feedLang);
    }

    /**
     * Converts a parsed item into an unread Item entity.
     *
     * @param object $parsedItem the item produced by the parser
     * @param object $parsedFeed the feed the item belongs to, only used to
     * determine the text direction
     * @return Item the populated item including its search index
     */
    protected function buildItem($parsedItem, $parsedFeed) {
        $item = new Item();
        $item->setUnread();
        $item->setUrl($parsedItem->getUrl());
        $item->setGuid($parsedItem->getId());
        $item->setGuidHash($item->getGuid());
        $item->setPubDate($parsedItem->getPublishedDate()->getTimestamp());
        $item->setUpdatedDate($parsedItem->getUpdatedDate()->getTimestamp());
        $item->setRtl($this->determineRtl($parsedItem, $parsedFeed));

        // unescape content because angularjs helps against XSS
        $item->setTitle($this->decodeTwice($parsedItem->getTitle()));
        $item->setAuthor($this->decodeTwice($parsedItem->getAuthor()));

        // purification is done in the service layer
        $body = $parsedItem->getContent();
        $bodyEncoding = mb_detect_encoding($body);
        // mb_detect_encoding() returns false if no encoding matches; false
        // is not a valid source encoding, so the body is then kept as it is
        if ($bodyEncoding !== false) {
            $body = mb_convert_encoding(
                $body, 'HTML-ENTITIES', $bodyEncoding
            );
        }
        $item->setBody($body);

        // only audio and video enclosures are stored on the item
        $enclosureUrl = $parsedItem->getEnclosureUrl();
        if ($enclosureUrl) {
            $enclosureType = $parsedItem->getEnclosureType();
            if (stripos($enclosureType, 'audio/') !== false ||
                stripos($enclosureType, 'video/') !== false
            ) {
                $item->setEnclosureMime($enclosureType);
                $item->setEnclosureLink($enclosureUrl);
            }
        }

        $item->generateSearchIndex();

        return $item;
    }

    /**
     * Converts a parsed feed into a Feed entity.
     *
     * @param object $parsedFeed the feed produced by the parser
     * @param string $url the url used to add the feed
     * @param bool $getFavicon if true the favicon is looked up using the
     * link of the feed
     * @param string $modified last modified value of the fetched resource
     * @param string $etag etag of the fetched resource
     * @param string $location the url where the feed was found
     * @return Feed the populated feed
     */
    protected function buildFeed($parsedFeed, $url, $getFavicon, $modified,
                                 $etag, $location) {
        $feed = new Feed();

        // fall back to the feed location if the feed names no site url
        $link = $parsedFeed->getSiteUrl();
        if (!$link) {
            $link = $location;
        }

        // unescape content because angularjs helps against XSS
        $title = strip_tags($this->decodeTwice($parsedFeed->getTitle()));
        $feed->setTitle($title);
        $feed->setUrl($url);  // the url used to add the feed
        $feed->setLocation($location);  // the url where the feed was found
        $feed->setLink($link);  // <link> attribute in the feed
        $feed->setHttpLastModified($modified);
        $feed->setHttpEtag($etag);
        $feed->setAdded($this->time->getTime());

        if ($getFavicon) {
            $faviconFetcher = $this->faviconFactory->build();
            $favicon = $faviconFetcher->find($feed->getLink());
            $feed->setFaviconLink($favicon);
        }

        return $feed;
    }

}
|