|
本帖最后由 faunus 于 2014-2-25 09:20 编辑
(这些都不是最完美的,那个研究中...)
(1)原始方案
System.Text.Encoding.Default
采用当前的默认编码,可以解决大部分问题,但是在很多情况下同样会出错。
(2)HttpWebResponse 的 CharacterSet(MSDN 上给的方案,本来应该的做法)
该属性可以返回该网站编码,但是总为空。
(3)老外的解决之道
Sven Groot
I came across a very silly (and annoying) bug in the HttpWebResponse.CharacterSet property. Let's see if you can spot it (code extracted with reflector):
- public string get_CharacterSet() // Decompiled getter quoted as-is: derives the charset from the Content-Type header and caches it in m_CharacterSet.
- {
- this.CheckDisposed();
- string text1 = this.m_HttpResponseHeaders.ContentType; // raw Content-Type header value (original casing)
- if ((this.m_CharacterSet == null) && !ValidationHelper.IsBlankString(text1)) // parse only when nothing is cached yet; IsBlankString is presumably a null/empty check (helper not shown)
- {
- this.m_CharacterSet = string.Empty; // default when no charset can be determined
- string text2 = text1.ToLower(CultureInfo.InvariantCulture); // lower-cased copy used only for searching
- if (text2.Trim().StartsWith("text/"))
- {
- this.m_CharacterSet = "ISO-8859-1"; // fallback for text/* types unless a charset parameter is found below
- }
- int num1 = text2.IndexOf(";"); // first ';' — parameters can only follow it
- if (num1 > 0)
- {
- while ((num1 = text2.IndexOf("charset", num1)) >= 0)
- {
- num1 += 7; // step past "charset" (7 characters)
- if ((text2[num1 - 8] == ';') || (text2[num1 - 8] == ' ')) // character just before "charset" must be ';' or ' '
- {
- while ((num1 < text2.Length) && (text2[num1] == ' ')) // skip spaces between the name and '='
- {
- num1++;
- }
- if ((num1 < (text2.Length - 1)) && (text2[num1] == '='))
- {
- num1++; // num1 now indexes the first character of the value
- int num2 = text2.IndexOf(';', num1); // index of the ';' that ends the value, or -1
- if (num2 > num1)
- {
- this.m_CharacterSet = text1.Substring(num1, num2).Trim(); // NOTE(review): Substring takes (startIndex, length) but num2 is an end index; num2 - num1 looks intended — likely the bug the post alludes to
- break;
- }
- this.m_CharacterSet = text1.Substring(num1).Trim(); // no terminating ';': value runs to the end of the header
- break;
- }
- }
- }
- }
- }
- return this.m_CharacterSet;
- }
复制代码
(4)另一种方法(先获取内容,再根据内容来判断)
- /// <summary>
- /// Downloads the resource at <paramref name="url"/> and decodes it to text.
- /// </summary>
- /// <param name="url">Address of the page to download.</param>
- /// <param name="encoding">
- /// Encoding used to decode the bytes. When null, the bytes are first decoded
- /// as UTF-8 and that text is passed to GetEncoding to choose the encoding.
- /// </param>
- /// <returns>The decoded page text.</returns>
- static string GetHtml(string url, Encoding encoding)
- {
-     byte[] buf;
-     // WebClient is IDisposable; release it as soon as the bytes are in memory.
-     using (WebClient client = new WebClient())
-     {
-         buf = client.DownloadData(url);
-     }
-     if (encoding != null)
-     {
-         return encoding.GetString(buf);
-     }
-     // No encoding supplied: decode as UTF-8 first, then let GetEncoding inspect the text.
-     string html = Encoding.UTF8.GetString(buf);
-     encoding = GetEncoding(html);
-     if (encoding == null || encoding == Encoding.UTF8)
-     {
-         // Nothing detected, or UTF-8 itself: the first decode is already the result.
-         return html;
-     }
-     return encoding.GetString(buf);
- }
复制代码
(5)结合3/4的办法(当然去看HttpWebResponse函数的源代码了)
//得到CharacterSet
- /// <summary>
- /// Extracts the charset name from the Content-Type header of a response.
- /// </summary>
- /// <param name="httpResp">Response whose ContentType is inspected.</param>
- /// <returns>
- /// The charset value, trimmed and without surrounding double quotes, or
- /// string.Empty when the header is missing or has no "charset=" parameter.
- /// </returns>
- private string getEncoding(HttpWebResponse httpResp)
- {
-     const string Marker = "charset=";
-     string contentType = httpResp.ContentType;
-     if (string.IsNullOrEmpty(contentType))
-     {
-         return string.Empty;
-     }
-     // Parameter names are case-insensitive, so "Charset=UTF-8" must match too.
-     int start = contentType.IndexOf(Marker, System.StringComparison.OrdinalIgnoreCase);
-     if (start < 0)
-     {
-         return string.Empty;
-     }
-     start += Marker.Length;
-     // The value ends at the next ';' or, if there is none, at the end of the header.
-     int end = contentType.IndexOf(';', start);
-     string charset = end >= start
-         ? contentType.Substring(start, end - start)
-         : contentType.Substring(start);
-     // Trim in both cases and drop optional quotes, e.g. charset="utf-8".
-     return charset.Trim().Trim('"').Trim();
- }
复制代码
|
|