1: <?php
2: /* ***** BEGIN LICENSE BLOCK *****
3: * Version: NPL 1.1/GPL 2.0/LGPL 2.1
4: *
5: * The contents of this file are subject to the Netscape Public License
6: * Version 1.1 (the "License"); you may not use this file except in
7: * compliance with the License. You may obtain a copy of the License at
8: * http://www.mozilla.org/NPL/
9: *
10: * Software distributed under the License is distributed on an "AS IS" basis,
11: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12: * for the specific language governing rights and limitations under the
13: * License.
14: *
15: * The Original Code is Mozilla Communicator client code.
16: *
17: * The Initial Developer of the Original Code is
18: * Netscape Communications Corporation.
19: * Portions created by the Initial Developer are Copyright (C) 1998
20: * the Initial Developer. All Rights Reserved.
21: *
22: * Contributor(s):
23: * Henri Sivonen, hsivonen@iki.fi
24: *
25: *
26: * Alternatively, the contents of this file may be used under the terms of
27: * either the GNU General Public License Version 2 or later (the "GPL"), or
28: * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29: * in which case the provisions of the GPL or the LGPL are applicable instead
30: * of those above. If you wish to allow use of your version of this file only
31: * under the terms of either the GPL or the LGPL, and not to allow others to
32: * use your version of this file under the terms of the NPL, indicate your
33: * decision by deleting the provisions above and replace them with the notice
34: * and other provisions required by the GPL or the LGPL. If you do not delete
35: * the provisions above, a recipient may use your version of this file under
36: * the terms of any one of the NPL, the GPL or the LGPL.
37: *
38: * ***** END LICENSE BLOCK ***** */
39:
40: /*
41: * For the original C++ code, see
42: * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
43: * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
44: *
45: * The latest version of this file can be obtained from
46: * http://iki.fi/hsivonen/php-utf8/
47: *
48: * Version 1.0, 2003-05-30
49: */
50:
/**
 * Takes a UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported, i.e. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are dropped from
 * the output.
 *
 * The following are all treated as invalid input:
 *  - non-shortest-form (overlong) encodings,
 *  - surrogates (U+D800..U+DFFF),
 *  - codepoints above U+10FFFF, including every 5- and 6-octet sequence,
 *  - continuation octets without a lead octet, and lead octets that are not
 *    followed by the required number of continuation octets,
 *  - a multi-octet sequence cut short by the end of the string.
 *
 * @param string $str UTF-8 octets to decode. Taken by reference, but never
 *                    modified by this function.
 * @return array|false List of codepoints (ints) in input order, or false if
 *                     $str isn't a valid UTF-8 octet sequence.
 */
function utf8ToUnicode(&$str)
{
    $mState = 0; // number of continuation octets still expected before the
                 // current multi-octet sequence is complete
    $mUcs4  = 0; // codepoint assembled so far
    $mBytes = 1; // total number of octets in the current sequence

    $out = array();

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        // Square brackets on purpose: the curly-brace offset syntax
        // ($str{$i}) was removed in PHP 8.0.
        $in = ord($str[$i]);

        if (0 == $mState) {
            // Expecting either a US-ASCII character or the lead octet of a
            // multi-octet sequence.
            if (0 == (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;
            } else if (0xC0 == (0xE0 & $in)) {
                // Lead octet of a 2 octet sequence: 110xxxxx
                $mUcs4  = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } else if (0xE0 == (0xF0 & $in)) {
                // Lead octet of a 3 octet sequence: 1110xxxx
                $mUcs4  = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } else if (0xF0 == (0xF8 & $in)) {
                // Lead octet of a 4 octet sequence: 11110xxx
                $mUcs4  = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } else {
                /* Either a stray continuation octet (10xxxxxx), a lead octet
                 * of a 5 or 6 octet sequence, or 0xFE/0xFF.
                 *
                 * 5 and 6 octet sequences can only encode a codepoint that is
                 * not in shortest form or that lies outside 0-0x10FFFF, so no
                 * continuation could make them valid; reject right away.
                 */
                return false;
            }
            continue;
        }

        // $mState is non-zero: only a continuation octet (10xxxxxx) is legal.
        if (0x80 != (0xC0 & $in)) {
            // Incomplete multi-octet sequence.
            return false;
        }

        // Place the 6 payload bits at the position this octet occupies.
        $mUcs4 |= ($in & 0x3F) << (($mState - 1) * 6);

        if (0 != --$mState) {
            continue;
        }

        /* End of the multi-octet sequence. $mUcs4 now contains the final
         * Unicode codepoint; check for illegal sequences and codepoints.
         */
        if (
            // From Unicode 3.1, non-shortest form is illegal
            ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
            // From Unicode 3.2, surrogate characters are illegal
            (($mUcs4 >= 0xD800) && ($mUcs4 <= 0xDFFF)) ||
            // Codepoints outside the Unicode range are illegal
            ($mUcs4 > 0x10FFFF)
        ) {
            return false;
        }

        if (0xFEFF != $mUcs4) {
            // BOM is legal but we don't want to output it
            $out[] = $mUcs4;
        }

        // Reset the decoder for the next character.
        $mUcs4  = 0;
        $mBytes = 1;
    }

    if (0 != $mState) {
        // The string ended in the middle of a multi-octet sequence.
        return false;
    }

    return $out;
}
169:
/**
 * Encodes a list of Unicode codepoints as a UTF-8 string.
 *
 * Codepoints above 0xFFFF (astral planes) are supported. Every occurrence of
 * the BOM (0xFEFF) is skipped and produces no output.
 *
 * @param array $arr Codepoints as ints. Taken by reference, but never
 *                   modified by this function.
 * @return string|false The UTF-8 octets, or false if any entry is negative,
 *                      is a surrogate (0xD800..0xDFFF) or is greater than
 *                      0x10FFFF.
 */
function unicodeToUtf8(&$arr)
{
    $utf8 = '';

    foreach ($arr as $codepoint) {
        // Negative values can never be codepoints.
        if ($codepoint < 0) {
            return false;
        }

        // One octet: 0xxxxxxx
        if ($codepoint <= 0x007f) {
            $utf8 .= chr($codepoint);
            continue;
        }

        // Two octets: 110xxxxx 10xxxxxx
        if ($codepoint <= 0x07ff) {
            $utf8 .= chr(0xc0 | ($codepoint >> 6))
                   . chr(0x80 | ($codepoint & 0x003f));
            continue;
        }

        // The BOM is silently dropped.
        if ($codepoint == 0xFEFF) {
            continue;
        }

        // Surrogates are not characters and must not be encoded.
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
            return false;
        }

        // Three octets: 1110xxxx 10xxxxxx 10xxxxxx
        if ($codepoint <= 0xffff) {
            $utf8 .= chr(0xe0 | ($codepoint >> 12))
                   . chr(0x80 | (($codepoint >> 6) & 0x003f))
                   . chr(0x80 | ($codepoint & 0x003f));
            continue;
        }

        // Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if ($codepoint <= 0x10ffff) {
            $utf8 .= chr(0xf0 | ($codepoint >> 18))
                   . chr(0x80 | (($codepoint >> 12) & 0x3f))
                   . chr(0x80 | (($codepoint >> 6) & 0x3f))
                   . chr(0x80 | ($codepoint & 0x3f));
            continue;
        }

        // Anything left is beyond the Unicode range.
        return false;
    }

    return $utf8;
}
/**
 * Checks that every line of a text is valid UTF-8.
 *
 * The text is split on "\n" and each line is passed to utf8ToUnicode().
 * Every line that fails to decode is reported through dbg_error_log() with
 * its 1-based line number and its content.
 *
 * @param string $ics Text to validate.
 * @return bool true if every line decodes, false if at least one does not.
 */
function check_string($ics)
{
    $ics_file = explode("\n", $ics);

    // Zero-based indexes of the lines that are not valid UTF-8.
    $error = array();
    foreach ($ics_file as $line => $str) {
        if (false === utf8ToUnicode($str)) {
            $error[] = $line;
        }
    }

    if (0 == count($error)) {
        return true;
    }

    // Report every offending line before returning, not just the first one.
    // NOTE(review): dbg_error_log() is defined outside this file; the existing
    // "%s" and the trailing arguments suggest printf-style placeholders, so
    // the line number uses "%d" -- confirm against its definition.
    foreach ($error as $line) {
        dbg_error_log("LOG check_string", "error on lines %d invalid character in string %s", ($line + 1), $ics_file[$line]);
    }

    return false;
}
228: ?>
229: