Markup.class.php

<?php
/**
 * Markup.class.php - Handling of Stud.IP- and HTML-markup.
 **
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * @category    Stud.IP
 * @copyright   (c) 2014 Stud.IP e.V.
 * @license     http://www.gnu.org/licenses/gpl-2.0.html GPL version 2
 * @since       File available since Release 3.0
 * @author      Robert Costa <rcosta@uos.de>
 */
namespace Studip;

require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_ClassifyLinks.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_ClassifyTables.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_LinkifyEmail.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_TransformLinks.php';
require_once __DIR__ . '/htmlpurifier/HTMLPurifier_Injector_Unlinkify.php';

class Markup
{
    /**
     * Apply markup rules and clean the text up.
     *
     * The text is first converted by markupToHtml() (without adding the
     * HTML marker) and the result is then handed to the given formatter.
     *
     * @param TextFormat $markup  Markup rules applied on marked-up text.
     * @param string     $text    Marked-up text on which rules are applied.
     * @param boolean    $trim    Trim text before applying markup rules, if TRUE.
     *
     * @return string  HTML code computed from marked-up text.
     */
    public static function apply($markup, $text, $trim)
    {
        return $markup->format(self::markupToHtml($text, $trim, false));
    }

    // signature for HTML entries
    const HTML_MARKER = '<!--HTML-->';

    // signature for HTML fallback entries
    const HTML_MARKER_FALLBACK = '<!-- HTML: Insert text after this line only. -->';

    // regular expression for detecting HTML signature
    // (matches both markers above; case-insensitive, leading whitespace allowed)
    const HTML_MARKER_REGEXP = '/^\s*<!--\s*HTML.*?-->/i';

    /**
     * Return `true` if the WYSIWYG editor is enabled for this user.
     * @deprecated since Stud.IP 5.5
     *
     * @return boolean  always returns `true`.
     */
    public static function editorEnabled()
    {
        return true;
    }

    /**
     * Return `true` for HTML code and `false` for plain text.
     *
     * Text is considered to be HTML code only if it starts with a match
     * for `HTML_MARKER_REGEXP` (leading whitespace is ignored). Everything
     * else is considered to be plain text, even if it contains HTML tags.
     *
     * @param string $text  HTML code or plain text.
     *
     * @return boolean  `true` for HTML code, `false` for plain text.
     */
    public static function isHtml($text)
    {
        return self::hasHtmlMarker($text);
    }

    /**
     * Return `true` for Stud.IP-HTML and `false` otherwise.
     *
     * Stud.IP-HTML is HTML that can contain Stud.IP Markup.
     *
     * Stud.IP-HTML must match Stud.IP 3.2's HTML marker.
     * Leading and trailing whitespace is ignored.
     *
     * Everything else is considered not Stud.IP-HTML. In other
     * words, if it's not Stud.IP-HTML it might be everything
     * from plain text to binary code. But usually it's either
     * Stud.IP markup or plain HTML code, then.
     *
     * @param string $text  Text that is or isn't Stud.IP-HTML.
     *
     * @return boolean  `true` for Stud.IP-HTML
     */
    public static function isHtmlFallback($text)
    {
        $text = trim($text);

        // it's not fallback if the new HTML marker is detected
        if (MarkupPrivate\Text\startsWith($text, self::HTML_MARKER)) {
            return false;
        }

        // it's Stud.IP-HTML if Stud.IP 3.2's HTML marker is detected
        return MarkupPrivate\Text\startsWith($text, self::HTML_MARKER_FALLBACK);
    }

    /**
     * Return `true` for HTML code and `false` for plain text.
     *
     * HTML code must start with a match for `HTML_MARKER_REGEXP`.
     *
     * @param string $text  HTML code or plain text.
     *
     * @return boolean  `true` for HTML code, `false` for plain text.
     */
    public static function hasHtmlMarker($text)
    {
        // preg_match() yields 1, 0 or FALSE (on error); normalise that to
        // the documented boolean. The cast keeps NULL away from preg_match().
        return preg_match(self::HTML_MARKER_REGEXP, (string) $text) === 1;
    }

    /**
     * Mark a given text as HTML code.
     *
     * No sanity-checking is done on the given text. It is simply
     * marked up so to be identified by Markup::isHtml as HTML
     * code. Text that already carries a marker and text that is
     * empty after trimming are returned unchanged.
     *
     * @param string $text  The text to be marked up as HTML code.
     *
     * @return string  The text marked up as HTML code.
     */
    public static function markAsHtml($text)
    {
        // NOTE keep this function in sync with the JavaScript
        // function markAsHtml in WyswygHtmlHead.php
        if (self::hasHtmlMarker($text) || trim((string) $text) === '') {
            return $text; // marker already set, don't set twice
        }
        return self::HTML_MARKER . $text;
    }

    /**
     * Apply markup rules after running text through HTML ready.
     *
     * Line breaks are normalised to Unix format before escaping, and every
     * remaining newline of the formatted result is replaced by `<br>`.
     *
     * @param TextFormat $markup  Markup rules applied on marked-up text.
     * @param string     $text    Marked-up text on which rules are applied.
     * @param boolean    $trim    Trim text before applying markup rules, if TRUE.
     *
     * @return string  HTML code computed from marked-up text.
     */
    private static function markupHtmlReady($markup, $text, $trim)
    {
        $escaped = self::htmlReady(self::unixEOL($text), $trim);

        return str_replace("\n", '<br>', self::markupText($markup, $escaped));
    }

    /**
     * Convert line break to Unix format.
     *
     * @param string $text  Text with possibly mixed line breaks (Win, Mac, Unix).
     *
     * @return string  Text with Unix line breaks only.
     */
    private static function unixEOL($text)
    {
        return preg_replace("/\r\n?/", "\n", $text);
    }

    /**
     * Apply markup rules on plain text.
     *
     * The formatted text is additionally passed through the global
     * symbol() function (defined outside of this file).
     *
     * @param TextFormat $markup  Markup rules applied on marked-up text.
     * @param string     $text    Marked-up text on which rules are applied.
     *
     * @return string  HTML code computed from marked-up text.
     */
    private static function markupText($markup, $text)
    {
        return symbol($markup->format($text));
    }

    /**
     * Call HTMLPurifier to create safe HTML.
     *
     * @param   string  $dirty_html  Unsafe or 'uncleaned' HTML code.
     * @param   boolean $autoformat  Apply the AutoFormat rules
     * @return  string              Clean and safe HTML code.
     */
    private static function purify($dirty_html, $autoformat = true)
    {
        $purifier = self::createPurifier($autoformat);

        return $purifier->purify($dirty_html);
    }

    /**
     * Call HTMLPurifier to filter the HTML code (if the source is detected
     * to contain HTML, returns the argument unchanged otherwise). The HTML
     * marker is restored afterwards, if it was present.
     *
     * For an \I18NString the original text and every translation are
     * filtered individually and a new \I18NString is returned.
     *
     * @param   string|\I18NString $html  Unsafe or 'uncleaned' HTML code.
     * @return  string|\I18NString        Clean and safe HTML code.
     */
    public static function purifyHtml($html)
    {
        if ($html instanceof \I18NString) {
            $base = self::purifyHtml($html->original());
            $lang = $html->toArray();

            // assign by key instead of iterating by reference, so that no
            // dangling reference into $lang is left behind
            foreach ($lang as $key => $value) {
                $lang[$key] = self::purifyHtml($value);
            }

            return new \I18NString($base, $lang);
        }

        if (self::isHtml($html)) {
            $html = self::markAsHtml(self::purify($html));
        }

        return $html;
    }

    /**
     * Create HTML purifier instance with Stud.IP-specific configuration.
     *
     * @param  boolean $autoformat  Apply the AutoFormat rules
     * @return \HTMLPurifier A new instance of the HTML purifier.
     */
    private static function createPurifier($autoformat)
    {
        $config = \HTMLPurifier_Config::createDefault();
        $config->set('Cache.SerializerPath', $GLOBALS['TMP_PATH']);
        $config->set('Core.RemoveInvalidImg', true);

        // restrict allowed HTML tags and attributes
        //
        // note that changes here should also be reflected in CKEditor's
        // settings!!
        //
        // NOTE The list could be restricted even further by allowing only
        // specific values for some attributes and CSS styles, but that is not
        // directly supported by HTMLPurifier and would need to be implemented
        // with a filter similar to ClassifyLinks.
        //
        // This is a list of further restrictions that can/should be introduced
        // at a later time point maybe, if possible:
        //
        // - always open external links in a new tab or window
        //   a[class="link-extern" href="..." target="_blank"]
        // - only allow left margin and horizontal text alignment to be set in
        //   divs (NOTE maybe remove these two features completely?):
        //   div[style="margin-left:(40|80|...)px; text-align:(center|right|justify)"]
        // - img[style] should only allow float:left or float:right
        // - only allow text color and background color to be set in a span's
        //   style attribute (NOTE 'wiki-links' are currently set here due to
        //   implementation difficulties, but probably this should be
        //   changed...):
        //   span[style="color:(#000000|#800000|...);
        //               background-color:(#000000|#800000|...)"
        //        class="wiki-link"]
        // - tables should always have the class "content" (it should not be
        //   optional and no other class should be set):
        //   table[class="content"]
        // - table headings should have a column and/or a row scope or no scope
        //   at all, but nothing else:
        //   th[scope="(col | row)"]
        // - fonts: only Stud.IP-specific fonts should be allowed
        //
        $config->set('HTML.Allowed', '
            a[class|href|target|rel|name|id]
            audio[controls|src|height|width|style]
            big
            blockquote
            br
            caption
            code[class]
            div[class|style]
            em
            figure[class|style]
            figcaption
            h1
            h2
            h3
            h4
            h5
            h6
            hr
            i
            img[alt|src|height|width|class|style]
            li
            ol[reversed|start|style]
            p[style]
            pre[class]
            span[style|class]
            strong
            u
            ul[style]
            s
            small
            sub
            sup
            table[class|style]
            tbody
            td[colspan|rowspan|style]
            thead
            th[colspan|rowspan|style|scope]
            tr
            tt
            video[controls|src|height|width|style]
        ');

        $config->set('Attr.AllowedFrameTargets', ['_blank']);
        $config->set('Attr.AllowedRel', ['nofollow']);
        $config->set('Attr.EnableID', true);
        $config->set('Attr.AllowedClasses', [
            'author',
            'content',
            'image',
            'image-style-side',
            'image_resized',
            'language-cpp',
            'language-css',
            'language-diff',
            'language-java',
            'language-javascript',
            'language-json',
            'language-php',
            'language-python',
            'language-ruby',
            'language-scss',
            'language-sql',
            'language-xml',
            'link-extern',
            'link-intern',
            'math-tex',
            'table',
            'usercode',
            'wiki-link'
        ]);
        $config->set('CSS.AllowedFonts', [
            'serif',
            'sans-serif',
            'monospace',
            'cursive'
        ]);
        $config->set('CSS.AllowedProperties', [
            'margin-left',
            'text-align',
            'width',
            'height',
            'color',
            'background-color', // needed by span, td
            'border-color',
            'border-style',
            'float',
            'border'
        ]);
        $config->set('CSS.MaxImgLength', null);

        if ($autoformat) {
            $config->set('AutoFormat.Linkify', true);
            $config->set('AutoFormat.Custom', [
                'ClassifyLinks',
                'ClassifyTables',
                'LinkifyEmail'
            ]);
            $config->set('AutoFormat.RemoveSpansWithoutAttributes', true);
        } else {
            $config->set('AutoFormat.Custom', ['TransformLinks']);
        }

        // avoid <img src="evil_CSRF_stuff">
        $def = $config->getHTMLDefinition(true);
        $img = $def->addBlankElement('img');
        $img->attr_transform_post[]
            = new MarkupPrivate\Purifier\AttrTransform_Image_Source();

        // <audio> and <video> share the same definition
        $media_attributes = [
            'src*' => 'URI',
            'width' => 'Length',
            'height' => 'Length',
            'controls' => 'Text',     // Bool triggers bug in HTMLPurifier
        ];
        foreach (['audio', 'video'] as $media_element) {
            $def->addElement(
                $media_element, 'Inline', 'Flow', 'Common', $media_attributes
            );
        }

        $def->addElement('figcaption', 'Inline', 'Flow', 'Common');
        $def->addElement('figure', 'Block', 'Optional: (figcaption, Flow) | (Flow, figcaption) | Flow', 'Common');

        $def->addAttribute('ol', 'reversed', 'Bool');
        $def->addAttribute('ol', 'style', 'Text');
        $def->addAttribute('ul', 'style', 'Text');

        return new \HTMLPurifier($config);
    }

    /**
     * Convert special characters to HTML entities, and clean up.
     *
     * Byte sequences that are not valid UTF-8 are replaced by U+FFFD
     * instead of turning the whole text into an empty string.
     *
     * @param  string  $text  This text's special chars will be converted.
     * @param  boolean $trim  Trim text before applying markup rules, if TRUE.
     * @param  boolean $br    Replace newlines by <br>, if TRUE.
     * @param  boolean $double_encode  Encode existing HTML entities, if TRUE.
     * @return string         The converted string.
     */
    public static function htmlReady(
        $text, $trim = true, $br = false, $double_encode = true
    ) {
        // without ENT_SUBSTITUTE a single invalid byte makes
        // htmlspecialchars() return '' and the complete text is lost
        $text = htmlspecialchars(
            (string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'utf-8', $double_encode
        );

        if ($trim) {
            $text = trim($text);
        }
        if ($br) { // fix newlines
            $text = nl2br($text, false);
        }
        return $text;
    }

    /**
     * Prepare text for wysiwyg: the text is converted with markupToHtml()
     * (guarded by editorEnabled(), which always returns `true`) and the
     * result is then escaped using htmlReady.
     *
     * @param  string  $text  The text.
     * @param  boolean $trim  Trim text before applying markup rules, if TRUE.
     * @param  boolean $br    Replace newlines by <br>, if TRUE (passed on to htmlReady).
     * @param  boolean $double_encode  Encode existing HTML entities, if TRUE (passed on to htmlReady).
     * @return string         The converted string.
     */
    public static function wysiwygReady(
        $text, $trim = true, $br = false, $double_encode = true
    ) {
        if (self::editorEnabled()) {
            $text = self::markupToHtml($text, $trim);
        }
        return self::htmlReady($text, $trim, $br, $double_encode);
    }

    /**
     * Convert Stud.IP markup (possibly mixed with HTML if fallback mode is
     * enabled) to editable HTML. Pure HTML will only run through the purifier.
     *
     * Text that is empty after trimming is returned unchanged.
     *
     * @param  string  $text  The text.
     * @param  boolean $trim  Trim text before applying markup rules, if TRUE.
     * @param  boolean $mark  Mark result text as HTML, if TRUE.
     * @return string         The converted string.
     */
    public static function markupToHtml($text, $trim = true, $mark = true)
    {
        // compare against '' explicitly: a loose truthiness check would
        // also treat the text "0" as empty
        if (trim((string) $text) === '') {
            return $text;
        }
        if (self::isHtml($text)) {
            // the fallback marker must be detected before purifying
            $is_fallback = self::isHtmlFallback($text);
            $text = self::purify($text, false);

            if ($is_fallback) {
                $text = self::markupText(new \StudipCoreFormat(), $text);
            }
        } else {
            $text = self::markupHtmlReady(new \StudipCoreFormat(), $text, $trim);
        }

        return $mark ? self::markAsHtml($text) : $text;
    }

    /**
     * Call HTMLPurifier to remove all HTML tags from the string (if the source
     * is detected to contain HTML, returns the argument unchanged otherwise).
     *
     * @param   string  $html  HTML code to filter
     * @return  string         The converted string.
     */
    public static function removeHtml($html)
    {
        if (self::isHtml($html)) {
            $config = \HTMLPurifier_Config::createDefault();
            $config->set('Cache.SerializerPath', $GLOBALS['TMP_PATH']);
            $config->set('HTML.Allowed', 'a[href],img[alt|src],br');
            $config->set('AutoFormat.Custom', ['Unlinkify']);

            // append line breaks to closing block-level tags, so that the
            // block structure survives the removal of those tags
            $html = str_replace(
                ['</li>', '</ol>', '</ul>', '</tr>', '</p>', '</div>'],
                [
                    '</li><br>',
                    '</ol><br>',
                    '</ul><br>',
                    '</tr><br>',
                    '</p><br><br>',
                    '</div><br><br>'
                ],
                $html
            );

            $purifier = new \HTMLPurifier($config);
            $html = $purifier->purify($html);

            // Replace new lines with simple line break; twice because we don't
            // want to create unnecessary white space if a <br /> is followed
            // by a new line
            $html = str_replace('<br />' . PHP_EOL, PHP_EOL, $html);
            $html = str_replace('<br />', PHP_EOL, $html);

            $html = \decodeHTML(trim($html));
        }

        return $html;
    }
}

/**
 * Members of Studip\MarkupPrivate must not be used outside of this file!!
 */

namespace Studip\MarkupPrivate\Purifier;

use Studip\MarkupPrivate\MediaProxy;

/**
 * Remove invalid <img src> attributes.
 *
 * The `src` attribute is rewritten through MediaProxy\getMediaUrl(). If
 * that function rejects the URL, the attribute value is set to NULL and a
 * message is appended to $GLOBALS['msg'].
 */
class AttrTransform_Image_Source extends \HTMLPurifier_AttrTransform
{
    /**
     * Implements abstract method of base class.
     *
     * @param array $attr     Attributes of the element (name => value).
     * @param mixed $config   Not used by this transformation.
     * @param mixed $context  Not used by this transformation.
     *
     * @return array  The attributes with a possibly rewritten `src`.
     */
    public function transform($attr, $config, $context)
    {
        // nothing to check or rewrite if there is no source at all
        if (!isset($attr['src'])) {
            return $attr;
        }

        try {
            $attr['src'] = MediaProxy\getMediaUrl($attr['src']);
        } catch (MediaProxy\InvalidInternalLinkException $e) {
            // invalid internal link ==> remove <img src> attribute
            $GLOBALS['msg'][] = _('Ungültige interne Medienverknüpfung entfernt: ')
                . \htmlentities($e->getUrl());
            $attr['src'] = null; // remove <img src> attribute
        } catch (MediaProxy\ExternalMediaDeniedException $e) {
            // external media is not allowed ==> remove <img src> attribute
            $GLOBALS['msg'][] = _('Verbotene externe Medienverknüpfung entfernt: ')
                . \htmlentities($e->getUrl());
            $attr['src'] = null; // remove <img src> attribute
        }
        return $attr;
    }
}

//// media proxy //////////////////////////////////////////////////////////////

namespace Studip\MarkupPrivate\MediaProxy;

use Studip\MarkupPrivate\Text;

/**
 * Check if media proxy should be used and if so return the respective URL.
 *
 * Internal links to a valid media location are returned in normalized
 * form. External links are either routed through the media proxy or
 * returned as they are, depending on the LOAD_EXTERNAL_MEDIA setting.
 *
 * @param string $url   URL to media file.
 * @return string       URL string to media file (possibly 'proxied').
 *
 * @throws InvalidInternalLinkException  For internal links that do not
 *                                       point to a valid media location.
 * @throws ExternalMediaDeniedException  For external links that are
 *                                       neither proxied nor allowed.
 */
function getMediaUrl($url) {
    // proxied URLs shouldn't be stored in the database, but unwrap them
    // here for those cases where they accidentally ended up there
    $url = decodeMediaProxyUrl($url);

    // internal links: valid media locations pass, everything else fails
    if (isStudipMediaUrl($url)) {
        return transformInternalIdnaLink($url);
    }
    if (isInternalLink($url)) {
        throw new InvalidInternalLinkException($url);
    }

    // external links: behaviour depends on the configured policy
    $policy = \Config::get()->LOAD_EXTERNAL_MEDIA;
    if ($policy === 'allow') {
        return $url;
    }

    // the media proxy must be accessed by an internal link and is only
    // used for authenticated sessions
    $use_proxy = $policy === 'proxy'
        && \Seminar_Session::is_current_session_authenticated();
    if ($use_proxy) {
        return encodeMediaProxyUrl($url);
    }

    throw new ExternalMediaDeniedException($url);
}

/**
 * Return media proxy URL for an unproxied URL.
 *
 * The original URL is passed to the media proxy URL-encoded in the
 * `url` query parameter.
 *
 * @param string $url   Unproxied media URL.
 * @return string       Media proxy URL for accessing the same resource.
 */
function encodeMediaProxyUrl($url) {
    $proxy = getMediaProxyUrl();
    $target = \urlencode(\idna_link($url));

    return transformInternalIdnaLink($proxy . '?url=' . $target);
}

/**
 * Extract the original URL from a media proxy URL.
 *
 * Both the given URL and the media proxy URL are compared with the
 * Stud.IP domain removed.
 *
 * @param string  $url  The media proxy URL.
 * @return string  The original URL. If $url does not point to the media
 *                 proxy then this is the exact same value given by $url.
 */
function decodeMediaProxyUrl($url) {
    # TODO make it work for 'url=' at any position in query
    $urlpath = removeStudipDomain($url);
    $proxypath = removeStudipDomain(getMediaProxyUrl()) . '?url=';

    if (!Text\startsWith($urlpath, $proxypath)) {
        return $url;
    }

    $encoded_original = Text\removePrefix($urlpath, $proxypath);
    return \urldecode($encoded_original);
}

/**
 * Return Stud.IP's absolute media proxy URL.
 *
 * @return string  $GLOBALS['ABSOLUTE_URI_STUDIP'] followed by the path
 *                 of the media proxy dispatcher route.
 */
function getMediaProxyUrl() {
    $studip_uri = $GLOBALS['ABSOLUTE_URI_STUDIP'];

    return $studip_uri . 'dispatch.php/media_proxy';
}

/**
 * Test if an URL points to a valid internal Stud.IP media path.
 *
 * The URL must be an internal link and its Stud.IP-relative path must be
 * accepted by isStudipMediaUrlPath().
 *
 * @param   string   $url  Internal Stud.IP URL.
 * @return  boolean  TRUE for internal media link URLs, FALSE otherwise.
 */
function isStudipMediaUrl($url) {
    if (!isInternalLink($url)) {
        return false;
    }

    $relative_path = getStudipRelativePath($url);
    return isStudipMediaUrlPath($relative_path);
}

/**
 * Test if an URL is an internal Stud.IP URL.
 *
 * The URL is normalized with transformInternalIdnaLink() and the result
 * is checked by is_internal_url() (defined outside of this file).
 *
 * @param string $url  The URL to test.
 * @return mixed       The result of is_internal_url().
 */
function isInternalLink($url) {
    $normalized_url = transformInternalIdnaLink($url);

    return is_internal_url($normalized_url);
}

//// url utilities ////////////////////////////////////////////////////////////

/**
 * Remove domain name from internal URLs.
 *
 * Only path, query and fragment of an internal Stud.IP URL are kept, all
 * other components (scheme, authentication information, domain, ...) are
 * dropped. External URLs are left untouched.
 *
 * @param string $url   URL from which to remove internal domain.
 * @return string       URL without internal domain or the exact same
 *                      value as $url for external URLs.
 */
function removeStudipDomain($url) {
    if (!isInternalLink($url)) {
        return $url;
    }

    $components = \parse_url(transformInternalIdnaLink($url));

    // rebuild the URL from the components that follow the domain
    $stripped = isset($components['path']) ? $components['path'] : '';
    if (isset($components['query'])) {
        $stripped .= '?' . $components['query'];
    }
    if (isset($components['fragment'])) {
        $stripped .= '#' . $components['fragment'];
    }
    return $stripped;
}

/**
 * Return a URL's path component with the absolute Stud.IP path removed.
 *
 * NOTE: If the URL is not an internal Stud.IP URL, the path component will
 * nevertheless be returned without issuing an error message.
 *
 * If the URL has no path component at all (or cannot be parsed), an empty
 * string is returned.
 *
 * Example:
 * >>> getStudipRelativePath('http://localhost:8080'
 *      . '/studip/sendfile.php?type=0&file_id=ABC123&file_name=nice.jpg')
 * 'sendfile.php'
 *
 * @param string $url   The URL from which to return the Stud.IP-relative
 *                      path component.
 * @return string       Stud.IP-relative path component of $url.
 */
function getStudipRelativePath($url) {
    $parsed_url = \parse_url(transformInternalIdnaLink($url));
    $parsed_studip_url = getParsedStudipUrl();

    // parse_url() omits the 'path' key for URLs like 'http://host' and
    // returns FALSE for seriously malformed URLs; fall back to '' in both
    // cases instead of reading an undefined index
    $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
    $studip_path = isset($parsed_studip_url['path'])
        ? $parsed_studip_url['path']
        : '';

    return Text\removePrefix($path, $studip_path);
}

/**
 * Return an associative array containing the Stud.IP URL elements.
 *
 * The elements are taken from $GLOBALS['ABSOLUTE_URI_STUDIP'].
 *
 * see also: http://php.net/manual/en/function.parse-url.php
 *
 * @return mixed  Same values that PHP's parse_url() returns.
 */
function getParsedStudipUrl() {
    $studip_uri = $GLOBALS['ABSOLUTE_URI_STUDIP'];

    return \parse_url($studip_uri);
}

/**
 * Test if path is valid for internal Stud.IP media URLs.
 *
 * A path is valid if its first segment is one of the known media
 * locations and if its URL-decoded form contains no parent directory
 * reference. Besides '../' anywhere in the path, a path ending in '..'
 * and backslashes used as separators are rejected as well.
 *
 * @param string $path  The path component of an URL, relative to the
 *                      Stud.IP base path.
 * @return boolean      TRUE for valid media paths, FALSE otherwise.
 */
function isStudipMediaUrlPath($path) {
    $path = (string) $path;
    $valid_paths = ['sendfile.php', 'download', 'assets', 'pictures'];

    // the first segment (as given, not decoded) selects the media location
    list($path_head) = \explode('/', $path);
    if (!\in_array($path_head, $valid_paths, true)) {
        return false;
    }

    // Normalize before looking for parent directory references:
    // - backslashes are treated like slashes, since URL parsers may
    //   interpret them as path separators
    // - the appended slash makes a trailing '..' (e.g. 'assets/..')
    //   visible to the same '../' check
    $decoded = \str_replace('\\', '/', \urldecode($path)) . '/';

    return \mb_strpos($decoded, '../') === false;
}

/**
 * Return a normalized, internal URL.
 *
 * The URL is first passed to TransformInternalLinks() and the result is
 * then passed to idna_link() (both are defined outside of this file).
 *
 * @param string $url   An internal URL.
 * @return string       Normalized internal URL.
 */
function transformInternalIdnaLink($url) {
    $internal_url = \TransformInternalLinks($url);

    return \idna_link($internal_url);
}

//// url exceptions ///////////////////////////////////////////////////////////

/**
 * Base class for exceptions that refer to a specific URL.
 *
 * The exception is created without a message; the affected URL can be
 * retrieved with getUrl().
 */
class UrlException extends \Exception
{
    /**
     * The URL this exception refers to.
     */
    private $url;

    /**
     * @param string $url  The URL this exception refers to.
     */
    public function __construct($url)
    {
        parent::__construct();

        $this->url = $url;
    }

    /**
     * Return the URL that was passed to the constructor.
     *
     * @return string  The URL this exception refers to.
     */
    public function getUrl()
    {
        return $this->url;
    }
}

/**
 * Thrown by getMediaUrl() for URLs that are Stud.IP-internal but do not
 * point to a valid media location.
 */
class InvalidInternalLinkException extends UrlException
{
}

/**
 * Thrown by getMediaUrl() for external URLs if the LOAD_EXTERNAL_MEDIA
 * configuration neither proxies nor allows external media.
 */
class ExternalMediaDeniedException extends UrlException
{
}

//// string utilities /////////////////////////////////////////////////////////

namespace Studip\MarkupPrivate\Text;

/**
 * Test if string starts with prefix.
 *
 * The comparison is done on characters (multi-byte safe) and is strict,
 * i.e. the leading part of the string must be identical to the prefix.
 *
 * @param string $string  Tested string.
 * @param string $prefix  Prefix of tested string.
 *
 * @return boolean  TRUE if string starts with prefix.
 */
function startsWith($string, $prefix) {
    $prefix_length = \mb_strlen($prefix);
    $head = \mb_substr($string, 0, $prefix_length);

    return $head === $prefix;
}

/**
 * Test if string ends with suffix.
 *
 * The comparison is done on characters (multi-byte safe). Every string
 * ends with the empty suffix.
 *
 * @param string $string  Tested string.
 * @param string $suffix  Suffix of tested string.
 *
 * @return boolean  TRUE if string ends with suffix.
 */
function endsWith($string, $suffix) {
    // A start offset of -0 equals 0, so mb_substr() would return the whole
    // string for an empty suffix and the comparison below would fail.
    if ($suffix === '') {
        return true;
    }

    return \mb_substr($string, - \mb_strlen($suffix)) === $suffix;
}

/**
 * Remove prefix from string.
 *
 * Does not change the string if it has a different prefix.
 *
 * @param string $string The string that must start with the prefix.
 * @param string $prefix The prefix of the string.
 *
 * @return string String without prefix.
 */
function removePrefix($string, $prefix) {
    if (!startsWith($string, $prefix)) {
        return $string;
    }

    // cut off as many characters as the prefix is long
    $prefix_length = \mb_strlen($prefix);
    return \mb_substr($string, $prefix_length);
}