application/bookmark/LinkUtils.php

   1 <?php
   2
   3 use Shaarli\Bookmark\Bookmark;
   4 use Shaarli\Formatter\BookmarkDefaultFormatter;
   5
   6 /**
   7  * Extract title from an HTML document.
   8  *
   9  * @param string $html HTML content where to look for a title.
  10  *
  11  * @return bool|string Extracted title if found, false otherwise.
  12  */
  13 function html_extract_title($html)
  14 {
  15     if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
  16         return trim(str_replace("\n", '', $matches[1]));
  17     }
  18     return false;
  19 }
  20
  21 /**
  22  * Extract charset from HTTP header if it's defined.
  23  *
  24  * @param string $header HTTP header Content-Type line.
  25  *
  26  * @return bool|string Charset string if found (lowercase), false otherwise.
  27  */
  28 function header_extract_charset($header)
  29 {
  30     preg_match('/charset=["\']?([^; "\']+)/i', $header, $match);
  31     if (! empty($match[1])) {
  32         return strtolower(trim($match[1]));
  33     }
  34
  35     return false;
  36 }
  37
  38 /**
  39  * Extract charset HTML content (tag <meta charset>).
  40  *
  41  * @param string $html HTML content where to look for charset.
  42  *
  43  * @return bool|string Charset string if found, false otherwise.
  44  */
  45 function html_extract_charset($html)
  46 {
  47     // Get encoding specified in HTML header.
  48     preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc);
  49     if (!empty($enc[1])) {
  50         return strtolower($enc[1]);
  51     }
  52
  53     return false;
  54 }
  55
  56 /**
  57  * Extract meta tag from HTML content in either:
  58  *   - OpenGraph: <meta property="og:[tag]" ...>
  59  *   - Meta tag: <meta name="[tag]" ...>
  60  *
  61  * @param string $tag  Name of the tag to retrieve.
  62  * @param string $html HTML content where to look for charset.
  63  *
  64  * @return bool|string Charset string if found, false otherwise.
  65  */
  66 function html_extract_tag($tag, $html)
  67 {
  68     $propertiesKey = ['property', 'name', 'itemprop'];
  69     $properties = implode('|', $propertiesKey);
  70     // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
  71     $orCondition  = '["\']?(?:og:)?' . $tag . '["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
  72     // Support quotes in double quoted content, and the other way around
  73     $content = 'content=(["\'])((?:(?!\1).)*)\1';
  74     // Try to retrieve OpenGraph tag.
  75     $ogRegex = '#<meta[^>]+(?:' . $properties . ')=(?:' . $orCondition . ')[^>]*' . $content . '.*?>#';
  76     // If the attributes are not in the order property => content (e.g. Github)
  77     // New regex to keep this readable... more or less.
  78     $ogRegexReverse = '#<meta[^>]+' . $content . '[^>]+(?:' . $properties . ')=(?:' . $orCondition . ').*?>#';
  79
  80     if (
  81         preg_match($ogRegex, $html, $matches) > 0
  82         || preg_match($ogRegexReverse, $html, $matches) > 0
  83     ) {
  84         return $matches[2];
  85     }
  86
  87     return false;
  88 }
  89
  90 /**
  91  * In a string, converts URLs to clickable bookmarks.
  92  *
  93  * @param string $text       input string.
  94  *
  95  * @return string returns $text with all bookmarks converted to HTML bookmarks.
  96  *
  97  * @see Function inspired from http://www.php.net/manual/en/function.preg-replace.php#85722
  98  */
  99 function text2clickable($text)
 100 {
 101     $regex = '!(((?:https?|ftp|file)://|apt:|magnet:)\S+[a-z0-9\(\)]/?)!si';
 102     $format = function (array $match): string {
 103         return '<a href="' .
 104             str_replace(
 105                 BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_OPEN,
 106                 '',
 107                 str_replace(BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_CLOSE, '', $match[1])
 108             ) .
 109             '">' . $match[1] . '</a>'
 110         ;
 111     };
 112
 113     return preg_replace_callback($regex, $format, $text);
 114 }
 115
 116 /**
 117  * Auto-link hashtags.
 118  *
 119  * @param string $description Given description.
 120  * @param string $indexUrl    Root URL.
 121  *
 122  * @return string Description with auto-linked hashtags.
 123  */
 124 function hashtag_autolink($description, $indexUrl = '')
 125 {
 126     $tokens = '(?:' . BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_OPEN . ')' .
 127               '(?:' . BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_CLOSE . ')'
 128     ;
 129     /*
 130      * To support unicode: http://stackoverflow.com/a/35498078/1484919
 131      * \p{Pc} - to match underscore
 132      * \p{N} - numeric character in any script
 133      * \p{L} - letter from any language
 134      * \p{Mn} - any non marking space (accents, umlauts, etc)
 135      */
 136     $regex = '/(^|\s)#([\p{Pc}\p{N}\p{L}\p{Mn}' . $tokens . ']+)/mui';
 137     $format = function (array $match) use ($indexUrl): string {
 138         $cleanMatch = str_replace(
 139             BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_OPEN,
 140             '',
 141             str_replace(BookmarkDefaultFormatter::SEARCH_HIGHLIGHT_CLOSE, '', $match[2])
 142         );
 143         return $match[1] . '<a href="' . $indexUrl . './add-tag/' . $cleanMatch . '"' .
 144             ' title="Hashtag ' . $cleanMatch . '">' .
 145                 '#' . $match[2] .
 146         '</a>';
 147     };
 148
 149     return preg_replace_callback($regex, $format, $description);
 150 }
 151
 152 /**
 153  * This function inserts &nbsp; where relevant so that multiple spaces are properly displayed in HTML
 154  * even in the absence of <pre>  (This is used in description to keep text formatting).
 155  *
 156  * @param string $text input text.
 157  *
 158  * @return string formatted text.
 159  */
 160 function space2nbsp($text)
 161 {
 162     return preg_replace('/(^| ) /m', '$1&nbsp;', $text);
 163 }
 164
 165 /**
 166  * Format Shaarli's description
 167  *
 168  * @param string $description shaare's description.
 169  * @param string $indexUrl    URL to Shaarli's index.
 170  * @param bool   $autolink    Turn on/off automatic linkifications of URLs and hashtags
 171  *
 172  * @return string formatted description.
 173  */
 174 function format_description($description, $indexUrl = '', $autolink = true)
 175 {
 176     if ($autolink) {
 177         $description = hashtag_autolink(text2clickable($description), $indexUrl);
 178     }
 179
 180     return nl2br(space2nbsp($description));
 181 }
 182
 183 /**
 184  * Generate a small hash for a link.
 185  *
 186  * @param DateTime $date Link creation date.
 187  * @param int      $id   Link ID.
 188  *
 189  * @return string the small hash generated from link data.
 190  */
 191 function link_small_hash($date, $id)
 192 {
 193     return smallHash($date->format(Bookmark::LINK_DATE_FORMAT) . $id);
 194 }
 195
 196 /**
 197  * Returns whether or not the link is an internal note.
 198  * Its URL starts by `?` because it's actually a permalink.
 199  *
 200  * @param string $linkUrl
 201  *
 202  * @return bool true if internal note, false otherwise.
 203  */
 204 function is_note($linkUrl)
 205 {
 206     return isset($linkUrl[0]) && $linkUrl[0] === '?';
 207 }
 208
 209 /**
 210  * Extract an array of tags from a given tag string, with provided separator.
 211  *
 212  * @param string|null $tags      String containing a list of tags separated by $separator.
 213  * @param string      $separator Shaarli's default: ' ' (whitespace)
 214  *
 215  * @return array List of tags
 216  */
 217 function tags_str2array(?string $tags, string $separator): array
 218 {
 219     // For whitespaces, we use the special \s regex character
 220     $separator = $separator === ' ' ? '\s' : $separator;
 221
 222     return preg_split('/\s*' . $separator . '+\s*/', trim($tags) ?? '', -1, PREG_SPLIT_NO_EMPTY);
 223 }
 224
 225 /**
 226  * Return a tag string with provided separator from a list of tags.
 227  * Note that given array is clean up by tags_filter().
 228  *
 229  * @param array|null $tags      List of tags
 230  * @param string     $separator
 231  *
 232  * @return string
 233  */
 234 function tags_array2str(?array $tags, string $separator): string
 235 {
 236     return implode($separator, tags_filter($tags, $separator));
 237 }
 238
 239 /**
 240  * Clean an array of tags: trim + remove empty entries
 241  *
 242  * @param array|null $tags List of tags
 243  * @param string     $separator
 244  *
 245  * @return array
 246  */
 247 function tags_filter(?array $tags, string $separator): array
 248 {
 249     $trimDefault = " \t\n\r\0\x0B";
 250     return array_values(array_filter(array_map(function (string $entry) use ($separator, $trimDefault): string {
 251         return trim($entry, $trimDefault . $separator);
 252     }, $tags ?? [])));
 253 }