зеркало из https://github.com/nextcloud/news.git
197 строки
6.2 KiB
PHP
197 строки
6.2 KiB
PHP
<?php
|
|
/**
|
|
* ownCloud - News
|
|
*
|
|
* This file is licensed under the Affero General Public License version 3 or
|
|
* later. See the COPYING file.
|
|
*
|
|
* @author Alessandro Cosentino <cosenal@gmail.com>
|
|
* @author Bernhard Posselt <dev@bernhard-posselt.com>
|
|
* @copyright Alessandro Cosentino 2012
|
|
* @copyright Bernhard Posselt 2012, 2014
|
|
*/
|
|
|
|
namespace OCA\News\ArticleEnhancer;
|
|
|
|
use DOMDocument;
|
|
use DOMXpath;
|
|
|
|
use PicoFeed\Encoding\Encoding;
|
|
|
|
use OCA\News\Utility\PicoFeedClientFactory;
|
|
|
|
use OCA\News\Db\Item;
|
|
|
|
|
|
class XPathArticleEnhancer implements ArticleEnhancer {
|
|
|
|
private $clientFactory;
|
|
private $regexXPathPair;
|
|
|
|
|
|
/**
|
|
* @param \Utility\PicoFeedClientFactory $clientFactory
|
|
* @param array $regexXPathPair an associative array containing regex to
|
|
* match the url and the xpath that should be used for it to extract the
|
|
* page
|
|
*/
|
|
public function __construct(PicoFeedClientFactory $clientFactory,
|
|
array $regexXPathPair){
|
|
$this->clientFactory = $clientFactory;
|
|
$this->regexXPathPair = $regexXPathPair;
|
|
}
|
|
|
|
/**
|
|
* @param \OCA\News\Db\Item $item
|
|
* @return \OCA\News\Db\Item enhanced item
|
|
*/
|
|
public function enhance(Item $item){
|
|
|
|
foreach($this->regexXPathPair as $regex => $search) {
|
|
|
|
if(preg_match($regex, $item->getUrl())) {
|
|
$body = $this->getFile($item->getUrl());
|
|
|
|
// First check if either <meta charset="..."> or
|
|
// <meta http-equiv="Content-Type" ...> is specified and use it
|
|
// If this fails use mb_detect_encoding()
|
|
$regex = '/<meta\s+[^>]*(?:charset\s*=\s*[\'"]([^>\'"]*)[\'"]' .
|
|
'|http-equiv\s*=\s*[\'"]content-type[\'"]\s+[^>]*' .
|
|
'content\s*=\s*[\'"][^>]*charset=([^>]*)[\'"])[^>]*>' .
|
|
'/i';
|
|
if(preg_match($regex, $body, $matches)) {
|
|
$enc = strtoupper($matches[sizeof($matches) - 1]);
|
|
} else {
|
|
$enc = mb_detect_encoding($body);
|
|
}
|
|
$enc = $enc ? $enc : 'UTF-8';
|
|
$body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
|
|
if (trim($body) === '') {
|
|
return $item;
|
|
}
|
|
|
|
$dom = new DOMDocument();
|
|
$isOk = @$dom->loadHTML($body);
|
|
|
|
$xpath = new DOMXpath($dom);
|
|
$xpathResult = $xpath->evaluate($search);
|
|
|
|
// in case it wasnt a text query assume its a dom element and
|
|
// convert it to text
|
|
if(!is_string($xpathResult)) {
|
|
$xpathResult = $this->domToString($xpathResult);
|
|
}
|
|
|
|
$xpathResult = trim($xpathResult);
|
|
|
|
// convert all relative to absolute URLs
|
|
$xpathResult = $this->substituteRelativeLinks(
|
|
$xpathResult, $item->getUrl()
|
|
);
|
|
|
|
if($isOk && $xpathResult !== false && $xpathResult !== '') {
|
|
$item->setBody($xpathResult);
|
|
}
|
|
}
|
|
}
|
|
|
|
return $item;
|
|
}
|
|
|
|
|
|
private function getFile($url) {
|
|
$client = $this->clientFactory->build();
|
|
$client->execute($url);
|
|
$client->setUserAgent('Mozilla/5.0 AppleWebKit');
|
|
return $client->getContent();
|
|
}
|
|
|
|
|
|
/**
|
|
* Method which converts all relative "href" and "src" URLs of
|
|
* a HTML snippet with their absolute equivalent
|
|
* @param string $xmlString a HTML snippet as string with the relative URLs
|
|
* to be replaced
|
|
* @param string $absoluteUrl the approptiate absolute url of the HTML
|
|
* snippet
|
|
* @return string the result HTML snippet as a string
|
|
*/
|
|
protected function substituteRelativeLinks($xmlString, $absoluteUrl) {
|
|
$dom = new DOMDocument();
|
|
$dom->preserveWhiteSpace = false;
|
|
|
|
if($xmlString === '') {
|
|
return '';
|
|
}
|
|
|
|
$xmlString = '<div>' . $xmlString . '</div>';
|
|
$isOk = @$dom->loadHTML($xmlString, LIBXML_HTML_NOIMPLIED |
|
|
LIBXML_HTML_NODEFDTD);
|
|
|
|
if(!$isOk) {
|
|
return '';
|
|
}
|
|
|
|
foreach (['href', 'src'] as $attribute) {
|
|
$xpath = new DOMXpath($dom);
|
|
$xpathResult = $xpath->query(
|
|
"//*[@" . $attribute . " " .
|
|
"and not(contains(@" . $attribute . ", '://')) " .
|
|
"and not(starts-with(@" . $attribute . ", 'mailto:')) " .
|
|
"and not(starts-with(@" . $attribute . ", '//'))]");
|
|
foreach ($xpathResult as $linkNode) {
|
|
$urlElement = $linkNode->attributes->getNamedItem($attribute);
|
|
$abs = $this->relativeToAbsoluteUrl(
|
|
$urlElement->nodeValue, $absoluteUrl
|
|
);
|
|
$urlElement->nodeValue = htmlspecialchars($abs);
|
|
}
|
|
}
|
|
|
|
$xmlString = $dom->saveHTML();
|
|
|
|
// domdocument spoils the string with line breaks between the elements
|
|
// strip them
|
|
$xmlString = str_replace("\n", '', $xmlString);
|
|
|
|
return $xmlString;
|
|
}
|
|
|
|
|
|
/**
|
|
* Method which builds a URL by taking a relative URL and its corresponding
|
|
* absolute URL
|
|
* @param string $relativeUrl the relative URL
|
|
* @param string $absoluteUrl the absolute URL with at least scheme and host
|
|
* @return string the resulting absolute URL
|
|
*/
|
|
protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) {
|
|
$base = new \Net_URL2($absoluteUrl);
|
|
return $base->resolve($relativeUrl);
|
|
}
|
|
|
|
|
|
/**
|
|
* Method which turns an xpath result to a string
|
|
* you can customize it by overwriting this method
|
|
* @param mixed $xpathResult the result from the xpath query
|
|
* @return string the result as a string
|
|
*/
|
|
protected function domToString($xpathResult) {
|
|
$result = '';
|
|
foreach($xpathResult as $node) {
|
|
$result .= $this->toInnerHTML($node);
|
|
}
|
|
return $result;
|
|
}
|
|
|
|
|
|
protected function toInnerHTML($node) {
|
|
$dom = new DOMDocument();
|
|
$dom->appendChild($dom->importNode($node, true));
|
|
return trim($dom->saveHTML($dom->documentElement));
|
|
}
|
|
|
|
|
|
}
|