aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/LinkUtils.php
blob: d8dc8b5e01a2dc01112cdb5b874626012c650d10 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
<?php

/**
 * Extract title from an HTML document.
 *
 * The first <title> element found is used. The tag name is matched
 * case-insensitively and the opening tag may carry attributes
 * (e.g. <title lang="en">). Line breaks inside the title are replaced
 * by spaces and surrounding whitespace is trimmed.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found, false otherwise.
 */
function html_extract_title($html)
{
    // `(?:\s[^>]*)?` accepts optional attributes while still rejecting
    // other tags that merely start with "title" (e.g. <titles>).
    if (preg_match('!<title(?:\s[^>]*)?>(.*?)</title\s*>!is', $html, $matches)) {
        // Handle CRLF / CR line endings too, so that no stray "\r" remains in the title.
        return trim(str_replace(array("\r\n", "\r", "\n"), ' ', $matches[1]));
    }
    return false;
}

/**
 * Work out which charset a downloaded page is encoded in.
 *
 * Sources are tried in order, and the first one yielding a truthy value wins:
 *   1. the HTTP headers (Content-Type),
 *   2. the HTML content (<meta> charset declaration),
 *   3. the supplied default charset.
 *
 * @param array  $headers        HTTP headers array.
 * @param string $htmlContent    HTML content where to look for charset.
 * @param string $defaultCharset Fallback charset returned when nothing was detected (default: utf-8).
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    // Each `?:` only evaluates its right-hand side when the left one is falsy,
    // so the HTML is not scanned when the headers already provide a charset.
    return headers_extract_charset($headers)
        ?: (html_extract_charset($htmlContent) ?: $defaultCharset);
}

/**
 * Extract charset from HTTP headers if it's defined.
 *
 * Looks for a `charset` parameter in the `Content-Type` header. The parameter
 * name is matched case-insensitively and the value may be bare or quoted
 * (`charset=utf-8`, `charset="utf-8"`); surrounding quotes are not part of
 * the returned value.
 *
 * @param array $headers HTTP headers array, indexed by header name.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function headers_extract_charset($headers)
{
    if (empty($headers['Content-Type'])) {
        return false;
    }

    $contentType = $headers['Content-Type'];

    // A repeated header may be provided as a list of values (e.g. by
    // get_headers() in associative mode after redirects): use the last one.
    if (is_array($contentType)) {
        $contentType = end($contentType);
    }
    if (! is_string($contentType)) {
        return false;
    }

    // Stop the value at quotes, semicolons and whitespace so that neither the
    // closing quote nor a following parameter leaks into the result.
    if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', $contentType, $match)) {
        return strtolower($match[1]);
    }

    return false;
}

/**
 * Extract charset from HTML content (<meta> tag).
 *
 * Both declaration forms are recognised:
 *   - <meta charset="utf-8">
 *   - <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 *
 * The value may be unquoted, double-quoted or single-quoted, and the tag may
 * carry other attributes before or after the charset declaration.
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function html_extract_charset($html)
{
    // `[^>]*?` keeps the search inside a single <meta ...> tag; the value stops
    // at the first quote, whitespace, slash, semicolon or closing bracket.
    if (preg_match('#<meta\s[^>]*?charset\s*=\s*["\']?\s*([^"\'\s/>;]+)#i', $html, $enc)) {
        return strtolower($enc[1]);
    }

    return false;
}