]>
Commit | Line | Data |
---|---|---|
1557cefb A |
1 | <?php |
2 | ||
/**
 * Extract title from an HTML document.
 *
 * Only the first <title> element is considered: the capture is lazy, so it
 * stops at the first closing </title> instead of running up to the last one
 * in the document (pages may contain further <title> elements later on, for
 * instance inside inline SVG). The opening tag may carry attributes.
 *
 * Line feeds inside the title are replaced by spaces and the result is trimmed.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found, false otherwise.
 */
function html_extract_title($html)
{
    // (?:\s[^>]*)? accepts attributes on the tag without matching other tag
    // names that merely start with "title"; (.*?) keeps the capture minimal.
    if (preg_match('!<title(?:\s[^>]*)?>(.*?)</title>!is', $html, $matches)) {
        return trim(str_replace("\n", ' ', $matches[1]));
    }

    return false;
}
17 | ||
/**
 * Work out which charset a downloaded page uses.
 *
 * Sources are consulted in this order; the first one yielding a value wins:
 *   1. the HTTP headers (Content type),
 *   2. the HTML content (tag <meta charset>),
 *   3. the supplied default charset (UTF-8 unless overridden).
 *
 * @param array  $headers        HTTP headers array.
 * @param string $htmlContent    HTML content where to look for charset.
 * @param string $defaultCharset Fallback charset used when the other methods fail.
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    $fromHeaders = headers_extract_charset($headers);
    if ($fromHeaders) {
        return $fromHeaders;
    }

    // The HTML is only inspected when the headers gave nothing usable.
    $fromHtml = html_extract_charset($htmlContent);

    return $fromHtml ? $fromHtml : $defaultCharset;
}
43 | ||
/**
 * Extract charset from HTTP headers if it's defined.
 *
 * The Content-Type header is looked up under its canonical spelling first,
 * then case-insensitively, because HTTP header names are not case-sensitive.
 * The charset parameter name is matched case-insensitively as well, and
 * surrounding quotes (charset="utf-8") are not part of the returned value.
 *
 * @param array $headers HTTP headers array (header name => value).
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 *                     False is also returned when $headers is not an array or
 *                     when the Content-Type value is not a string.
 */
function headers_extract_charset($headers)
{
    if (! is_array($headers)) {
        return false;
    }

    $contentType = null;
    if (isset($headers['Content-Type'])) {
        $contentType = $headers['Content-Type'];
    } else {
        // Fall back to a case-insensitive search on the header name.
        foreach ($headers as $name => $value) {
            if (strcasecmp((string) $name, 'Content-Type') === 0) {
                $contentType = $value;
                break;
            }
        }
    }

    if (! is_string($contentType)) {
        return false;
    }

    // Quotes, semicolons and whitespace all terminate the value, so a quoted
    // charset does not leak its closing quote into the result.
    if (preg_match('/charset=["\']?([^"\';\s]+)/i', $contentType, $match)) {
        return strtolower($match[1]);
    }

    return false;
}
62 | ||
/**
 * Extract charset from HTML content (tag <meta charset>).
 *
 * Both the short form (<meta charset="...">) and the http-equiv form
 * (<meta http-equiv="Content-Type" content="text/html; charset=...">) are
 * recognised. The value may be unquoted, double-quoted or single-quoted, and
 * further attributes may follow it. The search never crosses the end of the
 * <meta> tag it started in.
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function html_extract_charset($html)
{
    // [^>]*? keeps the match inside a single <meta ...> tag; the value ends at
    // the first quote, whitespace, semicolon, slash or closing bracket.
    if (preg_match('#<meta\s[^>]*?charset\s*=\s*["\']?\s*([^"\'\s;/>]+)#i', $html, $enc)) {
        return strtolower($enc[1]);
    }

    return false;
}