oops, forgot to remove a conflict here

cbiesinger%web.de 2003-02-25 19:04:19 +00:00
Parent 26fd1afa1f
Commit 08eabd0243
1 changed file with 0 additions and 459 deletions


@@ -3992,465 +3992,6 @@ nsGlobalHistory::OnStopLookup()
return NS_OK;
}
<<<<<<< nsGlobalHistory.cpp
/**
*
* The input features into the autocomplete perceptron are as follows:
*
* Feature 1 = Frequency and recency metric for page in history
* (domain = positive real numbers)
* Value decays fast with age of page
* Uses HISTORY_FAST_DECAY_CONSTANT
* Feature 2 = Frequency and recency metric for page in history
* (high for newer, more accessed pages)
* Value decays slowly with age of page
* Uses HISTORY_SLOW_DECAY_CONSTANT
* Feature 3 = Was the url typed by the user?
* (domain = 0 or 1)
* Feature 4 = Recency metric for page in bookmarks
* (domain = real number between 0 and 1)
* Value decays fast with age of bookmark
* Uses BOOKMARKS_FAST_DECAY_CONSTANT
* Feature 5 = Recency metric for page in bookmarks
* (domain = real number between 0 and 1)
* Value decays slowly with age of bookmark
* Uses BOOKMARKS_SLOW_DECAY_CONSTANT
*
* Feature 1 and Feature 2 details:
*
* As an example, say a page was first seen on Day 1 and accessed from then
* until today (Day 4) with the following schedule:
*
* (Day 1, D times), (Day 2, C times), (Day 3, B times), (Day 4, A times)
*
* Then, the frequency+recency metric calculation for the page will be:
*
* FRMetric = A + (B * G) + (C * G^2) + (D * G^3)
*
* where G is the decay constant that takes values between 0 and 1.
* Values close to 1 lead to slow decay with age.
* Values close to 0 lead to fast decay with age.
*
* Feature 4 and Feature 5 only care about recency, not frequency.
*
* So, if a bookmark was added X days earlier,
*
* Bookmark Feature Value = G^X.
*
* where G is the decay constant that takes values between 0 and 1.
* Values close to 1 lead to slow decay with age.
* Values close to 0 lead to fast decay with age.
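*
* As a worked illustration (example numbers, not taken from this code):
* with G = 0.9 and a visit schedule of (Day 1, 1 time), (Day 2, 2 times),
* (Day 3, 1 time), (Day 4, 3 times),
*
* FRMetric = 3 + (1 * 0.9) + (2 * 0.81) + (1 * 0.729) = 6.249
*
* and a bookmark added 3 days ago contributes G^3 = 0.729.
* A standalone sketch of these computations appears after
* FillInputFeatures() below.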
*
* The rest of the url related features:
*
* Feature 6: Whether url ends in .htm or .html
* Feature 7: Is it a .com URL?
* Feature 8: Is it a .edu URL?
* Feature 9: Is it a .org URL?
* Feature 10: Is it a .net URL?
* Feature 11: Is it a .gov URL?
* Feature 12: Does the URL contain a ~ ?
* Feature 13: Does the URL start with http:// ?
* Feature 14: Does the URL start with ftp:// ?
* Feature 15: Does the URL start with file:// ?
* Feature 16: Does the URL start with gopher:// ?
* Feature 17: Does the URL start with https:// ?
* Feature 18: Does the host name end in a two letter country code?
* Feature 19: Number of /s in the URL.
* Feature 20: Number of ?s in the URL.
* Feature 21: Number of &s in the URL.
* Feature 22: Number of =s in the URL.
* Feature 23: Number of #s in the URL.
* Feature 24: Number of +s in the URL.
* Feature 25: Number of .s in the URL.
* Feature 26: Number of numerical [0-9] characters in the URL
* Feature 27: Number of alphabetical [a-zA-Z] characters in the URL
* Feature 28: Number of non-alphanumeric, non-[/?&=#+.] characters in the URL
* Feature 29: Number of .s in the hostname
* Feature 30: Number of numerical [0-9] characters in the hostname
* Feature 31: Number of alphabetical [a-zA-Z] characters in the hostname
* Feature 32: Number of non-alphanumeric, non-[/?&=#+.] characters in the hostname
* Feature 33: Number of .s in the hostname if we omit initial "www." or "ftp." (if any)
* Feature 34: Number of .s in the hostname if we omit ending ".XX" country code (if any)
* Feature 35: Number of .s in the hostname if we omit initial "www." or "ftp."
* and ending ".XX" country code (if any)
* Feature 36: Number of characters in hostname
* Feature 37: Number of characters in hostname excluding initial "www." or "ftp."
* Feature 38: Number of characters in URL
* Feature 39: Number of characters in URL excluding hostname
* Feature 40: Number of characters in web page title
* Feature 41: Is this a google search url?
* Feature 42: Is this a netscape search url?
* Feature 43: Is this a yahoo search url?
* Feature 44: Dummy input hardcoded to 1
*/
nsresult
nsGlobalHistory::FillInputFeatures(nsAString &aUrl,
PRFloat64 *aFeatures)
{
nsCOMPtr<nsIMdbRow> row;
nsresult rv = NS_OK;
PRInt32 ageInDays;
PRInt64 lastDate;
static nsCOMPtr<nsIBookmarksService> bs =
do_GetService(NS_BOOKMARKS_SERVICE_CONTRACTID, &rv);
nsCOMPtr<nsIURI> uri;
nsCAutoString curl, chost, cpath;
rv = NS_NewURI(getter_AddRefs(uri), aUrl);
if (NS_SUCCEEDED(rv) && uri) {
uri->GetSpec(curl);
uri->GetHost(chost);
uri->GetPath(cpath);
}
nsAutoString url(NS_ConvertUTF8toUCS2(curl).get());
nsAutoString path(NS_ConvertUTF8toUCS2(cpath).get());
nsAutoString host(NS_ConvertUTF8toUCS2(chost).get());
ToLowerCase(url);
ToLowerCase(host);
ToLowerCase(path);
// Calculate the input features for this training example.
rv = FindRow(kToken_URLColumn, curl.get(),
getter_AddRefs(row));
if (NS_FAILED(rv)) return rv;
// First, get the page in history related input features
rv = GetRowValue(row, kToken_FRFastDecayColumn, &aFeatures[0]);
if (NS_FAILED(rv)) return rv;
rv = GetRowValue(row, kToken_LastVisitDateColumn, &lastDate);
if (NS_FAILED(rv)) return rv;
ageInDays = GetAgeInDays(NormalizeTime(GetNow()), lastDate);
aFeatures[0] *= pow(HISTORY_FAST_DECAY_CONSTANT, (PRFloat64) ageInDays);
rv = GetRowValue(row, kToken_FRSlowDecayColumn, &aFeatures[1]);
if (NS_FAILED(rv)) return rv;
aFeatures[1] *= pow(HISTORY_SLOW_DECAY_CONSTANT, (PRFloat64) ageInDays);
aFeatures[2] = HasCell(mEnv, row, kToken_TypedColumn);
// Second, calculate the bookmark related input features.
aFeatures[3] = aFeatures[4] = 0;
if (bs) {
PRBool bookmarked;
rv = bs->IsBookmarked(curl.get(), &bookmarked);
if (NS_SUCCEEDED(rv) && bookmarked) {
// Get the date when the bookmark was added.
PRInt64 addDate;
nsCOMPtr<nsIRDFResource> rdfRes;
if (NS_SUCCEEDED(rv = gRDFService->GetResource(curl,
getter_AddRefs(rdfRes)))) {
nsCOMPtr<nsIRDFDataSource> bookmarkDS = do_QueryInterface(bs, &rv);
if (NS_SUCCEEDED(rv) && bookmarkDS) {
nsCOMPtr<nsIRDFNode> nodeType;
rv = bookmarkDS->GetTarget(rdfRes, kRDF_Type, PR_TRUE,
getter_AddRefs(nodeType));
if (NS_SUCCEEDED(rv)) {
if (nodeType == kNC_Bookmark) {
nsCOMPtr<nsIRDFNode> node;
rv = bookmarkDS->GetTarget(rdfRes, kNC_BookmarkAddDate, PR_TRUE,
getter_AddRefs(node));
if (rv != NS_RDF_NO_VALUE && node) {
nsCOMPtr<nsIRDFDate> rdfDate = do_QueryInterface(node, &rv);
if (NS_SUCCEEDED(rv) && rdfDate) {
rv = rdfDate->GetValue(&addDate);
}
}
}
}
}
}
if (NS_SUCCEEDED(rv)) {
ageInDays = GetAgeInDays(NormalizeTime(GetNow()), addDate);
aFeatures[3] = pow(BOOKMARK_FAST_DECAY_CONSTANT, ageInDays);
aFeatures[4] = pow(BOOKMARK_SLOW_DECAY_CONSTANT, ageInDays);
}
}
}
// Feature 6: Whether url ends in .htm or .html
nsAString::const_iterator start, end;
path.BeginReading(start);
path.EndReading(end);
aFeatures[5] = FindInReadable(NS_LITERAL_STRING(".htm"), start, end);
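// Note: FindInReadable() reports a match if ".htm" occurs anywhere in the
// path (and likewise for the .com/.edu/.org/.net/.gov checks on the host
// below), so these are "contains" tests rather than strict suffix tests.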
// Feature 7: Is it a .com URL?
host.BeginReading(start);
host.EndReading(end);
aFeatures[6] = FindInReadable(NS_LITERAL_STRING(".com"), start, end);
// Feature 8: Is it a .edu URL?
host.BeginReading(start);
host.EndReading(end);
aFeatures[7] = FindInReadable(NS_LITERAL_STRING(".edu"), start, end);
// Feature 9: Is it a .org URL?
host.BeginReading(start);
host.EndReading(end);
aFeatures[8] = FindInReadable(NS_LITERAL_STRING(".org"), start, end);
// Feature 10: Is it a .net URL?
host.BeginReading(start);
host.EndReading(end);
aFeatures[9] = FindInReadable(NS_LITERAL_STRING(".net"), start, end);
// Feature 11: Is it a .gov URL?
host.BeginReading(start);
host.EndReading(end);
aFeatures[10] = FindInReadable(NS_LITERAL_STRING(".gov"), start, end);
// Feature 12: Does the URL contain a ~ ?
path.BeginReading(start);
path.EndReading(end);
aFeatures[11] = FindInReadable(NS_LITERAL_STRING("~"), start, end);
// Feature 13: Does the URL start with http:// ?
PRBool isScheme;
aFeatures[12] = aFeatures[13] = aFeatures[14] = aFeatures[15] = aFeatures[16] = 0;
if (NS_SUCCEEDED(uri->SchemeIs("http", &isScheme)) && isScheme) {
aFeatures[12] = 1;
}
// Feature 14: Does the URL start with ftp:// ?
else if (NS_SUCCEEDED(uri->SchemeIs("ftp", &isScheme)) && isScheme) {
aFeatures[13] = 1;
}
// Feature 15: Does the URL start with file:// ?
else if (NS_SUCCEEDED(uri->SchemeIs("file", &isScheme)) && isScheme) {
aFeatures[14] = 1;
}
// Feature 16: Does the URL start with gopher:// ?
else if (NS_SUCCEEDED(uri->SchemeIs("gopher", &isScheme)) && isScheme) {
aFeatures[15] = 1;
}
// Feature 17: Does the URL start with https:// ?
else if (NS_SUCCEEDED(uri->SchemeIs("https", &isScheme)) && isScheme) {
aFeatures[16] = 1;
}
// Feature 18: Does the host name end in a two letter country code?
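// The test below treats the host as ending in a country code when exactly
// two characters follow the last '.' (e.g. "www.state.ca.us"), with a
// variant that skips a single trailing dot first.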
PRInt32 hostLength = host.Length();
if (host[hostLength - 1] == '.') {
// Skip trailing dots in hostname if it exists. This will catch cases like
// http://www.state.ca.us./state/portal/myca_homepage.jsp
aFeatures[17] = (host.RFindChar('.', hostLength - 2) == (hostLength - 4));
}
else {
aFeatures[17] = (host.RFindChar('.') == ((hostLength - 1) - 2));
}
// Feature 19: Number of /s in the URL.
aFeatures[18] = 0;
// Feature 20: Number of ?s in the URL.
aFeatures[19] = 0;
// Feature 21: Number of &s in the URL.
aFeatures[20] = 0;
// Feature 22: Number of =s in the URL.
aFeatures[21] = 0;
// Feature 23: Number of #s in the URL.
aFeatures[22] = 0;
// Feature 24: Number of +s in the URL.
aFeatures[23] = 0;
// Feature 25: Number of .s in the URL.
aFeatures[24] = 0;
// Feature 26: Number of numerical [0-9] characters in the URL
aFeatures[25] = 0;
// Feature 27: Number of alphabetical [a-zA-Z] characters in the URL
aFeatures[26] = 0;
// Feature 28: Number of non-alphanumeric, non-[/?&=#+.] characters in the URL
aFeatures[27] = 0;
url.BeginReading(start);
url.EndReading(end);
PRUint32 size, i;
for ( ; start != end; start.advance(size)) {
const PRUnichar* buf = start.get();
size = start.size_forward();
// fragment at 'buf' is 'size' characters long
for (i = 0; i < size; buf++, i++) {
switch (*buf) {
case '/':
++aFeatures[18];
break;
case '?':
++aFeatures[19];
break;
case '&':
++aFeatures[20];
break;
case '=':
++aFeatures[21];
break;
case '#':
++aFeatures[22];
break;
case '+':
++aFeatures[23];
break;
case '.':
++aFeatures[24];
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
++aFeatures[25];
break;
default:
if (isalpha(*buf))
++aFeatures[26];
else
++aFeatures[27];
}
}
}
// Calculate a bunch of hostname related features.
// Feature 29: Number of .s in the hostname
aFeatures[28] = 0;
// Feature 30: Number of numerical [0-9] characters in the hostname
aFeatures[29] = 0;
// Feature 31: Number of alphabetical [a-zA-Z] characters in the hostname
aFeatures[30] = 0;
// Feature 32: Number of non-alphanumeric, non-[.] characters in the hostname
aFeatures[31] = 0;
size = chost.Length();
for (i = 0; i < size; i++) {
switch (chost[i]) {
case '.':
++aFeatures[28];
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
++aFeatures[29];
break;
default:
if (isalpha(chost[i]))
++aFeatures[30];
else
++aFeatures[31];
}
}
// Feature 33: Number of .s in the hostname if we omit initial "www." or "ftp." (if any)
aFeatures[32] = aFeatures[28];
// Feature 34: Number of .s in the hostname if we omit ending ".XX" country code (if any)
aFeatures[33] = aFeatures[28];
// Feature 35: Number of .s in the hostname if we omit initial "www." or "ftp."
// and ending ".XX" country code (if any)
aFeatures[34] = aFeatures[28];
// Feature 36: Number of characters in hostname
aFeatures[35] = chost.Length();
// Feature 37: Number of characters in hostname excluding initial "www." or "ftp."
aFeatures[36] = aFeatures[35];
if (chost.Find("www.") == 0 || chost.Find("ftp.") == 0) {
--aFeatures[32];
--aFeatures[34];
aFeatures[36] -= 4;
}
if (aFeatures[17]) {
--aFeatures[33];
--aFeatures[34];
}
// Feature 38: Number of characters in URL
aFeatures[37] = url.Length();
// Feature 39: Number of characters in URL excluding hostname
aFeatures[38] = aFeatures[37] - aFeatures[35];
// Feature 40: Number of characters in web page title
nsAutoString title;
rv = GetRowValue(row, kToken_NameColumn, title);
if (NS_FAILED(rv)) return rv;
aFeatures[39] = title.Length();
// Feature 41: Is this a google search url?
url.BeginReading(start);
url.EndReading(end);
aFeatures[40] = FindInReadable(NS_LITERAL_STRING("http://www.google.com/search"), start, end);
// Feature 42: Is this a netscape search url?
url.BeginReading(start);
url.EndReading(end);
aFeatures[41] = FindInReadable(NS_LITERAL_STRING("http://search.netscape.com/nscp_results.adp"), start, end);
// Feature 43: Is this a yahoo search url?
url.BeginReading(start);
url.EndReading(end);
aFeatures[42] = FindInReadable(NS_LITERAL_STRING("http://search.yahoo.com/bin/search"), start, end);
// Feature 44: This is a dummy input hardcoded to 1. It allows
// the perceptron to represent functions that do not pass through the
// origin.
aFeatures[43] = 1;
return rv;
}
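/*
 * Editorial illustration (not part of the original patch): a minimal,
 * self-contained sketch of the two decay formulas documented above
 * FillInputFeatures(). The function names, the G value of 0.9 and the
 * sample visit schedule are assumptions chosen for the example only.
 */
#include <cmath>

// FRMetric = A + B*G + C*G^2 + ... for visit counts ordered from the most
// recent day (index 0) back to the oldest day.
static double ExampleFRMetric(const double* visitsPerDay, int days, double g)
{
  double metric = 0.0;
  for (int i = 0; i < days; ++i)
    metric += visitsPerDay[i] * std::pow(g, i);
  return metric;
}

// Bookmark recency feature: G^X for a bookmark added X days ago.
static double ExampleBookmarkRecency(int ageInDays, double g)
{
  return std::pow(g, ageInDays);
}

// Example: visits {3, 1, 2, 1} (today first) with G = 0.9 give
// 3 + 0.9 + 1.62 + 0.729 = 6.249, and a 3-day-old bookmark gives 0.729.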
nsresult
nsGlobalHistory::WriteURLData(nsAString& aURL, PRFloat64* aURLFeatures)
{
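// Each call appends one record to mURLDataFile of the form
//   <url id='<row id>' [path='<url>'] time='<PR_Now() value>'>
//   f1, f2, ..., f{AC_NUM_URL_FEATURES}
//   </url>
// with the features written as comma-separated "%.2f" values; the path
// attribute is emitted only in URLDATACAPTURE_WITH_URL_INFO mode.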
nsCOMPtr<nsIMdbRow> row;
nsresult rv = NS_OK;
nsCAutoString dateStr, IDStr;
PRInt64 rowID;
if (!mURLDataFile || !aURLFeatures)
return NS_ERROR_FAILURE;
// Calculate the input features for this training example.
rv = FindRowAndID(kToken_URLColumn, NS_ConvertUCS2toUTF8(aURL).get(),
getter_AddRefs(row), &rowID);
if (NS_FAILED(rv)) return rv;
if (!rowID) {
AssignUniqueURLID(row, &rowID);
}
PRInt64ToChars(rowID, IDStr);
fprintf(mURLDataFile, "<url id='%s'", IDStr.get());
if (mDataCaptureMode == URLDATACAPTURE_WITH_URL_INFO) {
fprintf(mURLDataFile, " path='%s'", NS_ConvertUCS2toUTF8(aURL).get());
}
PRInt64ToChars(PR_Now(), dateStr);
fprintf(mURLDataFile, " time='%s'>\n", dateStr.get());
PRInt32 i;
for (i = 0; i < AC_NUM_URL_FEATURES - 1; i++) {
fprintf(mURLDataFile, "%.2f, ", aURLFeatures[i]);
}
fprintf(mURLDataFile, "%.2f\n</url>\n", aURLFeatures[i]);
return NS_OK;
}
=======
>>>>>>> 1.173
NS_IMETHODIMP
nsGlobalHistory::OnAutoComplete(const PRUnichar *searchString,
nsIAutoCompleteResults *previousSearchResult,