Makes langtag.net valid-tags tests pass

pull/10/head
Tpt 6 years ago
parent 1a71bae5e2
commit 01c1e812f5
  1. 345
      lib/src/model/language_tag.rs

@ -377,17 +377,24 @@ impl LanguageTag {
}
// There are no duplicate singleton (extension) subtags.
if let Some(extension) = self.extension() {
let mut seen_extensions = AlphanumericLowerCharSet::new();
if self.extension_subtags().any(|(extension, _)| {
if extension.split('-').any(|subtag| {
if subtag.len() == 1 {
let extension = subtag.chars().next().unwrap();
if seen_extensions.contains(extension) {
true
} else {
seen_extensions.insert(extension);
false
}
} else {
false
}
}) {
return Err(ValidationError::DuplicateExtension);
}
}
// There is no more than one extended language subtag.
// From [errata 5457](https://www.rfc-editor.org/errata/eid5457).
@ -677,124 +684,6 @@ impl fmt::Display for ValidationError {
}
}
#[test]
fn test_wellformed_tags() {
// Source: http://www.langtag.net/test-suites/well-formed-tags.txt
let tags = vec![
"fr",
"fr-Latn",
"fr-fra", // Extended tag
"fr-Latn-FR",
"fr-Latn-419",
"fr-FR",
"ax-TZ", // Not in the registry, but well-formed
"fr-shadok", // Variant
"fr-y-myext-myext2",
"fra-Latn", // ISO 639 can be 3-letters
"fra",
"fra-FX",
"i-klingon", // grandfathered with singleton
"I-kLINgon", // tags are case-insensitive...
"no-bok", // grandfathered without singleton
"fr-Lat", // Extended",
"mn-Cyrl-MN",
"mN-cYrL-Mn",
"fr-Latn-CA",
"en-US",
"fr-Latn-CA",
"i-enochian", // Grand fathered
"x-fr-CH",
"sr-Latn-CS",
"es-419",
"sl-nedis",
"de-CH-1996",
"de-Latg-1996",
"sl-IT-nedis",
"en-a-bbb-x-a-ccc",
"de-a-value",
"en-Latn-GB-boont-r-extended-sequence-x-private",
"en-x-US",
"az-Arab-x-AZE-derbend",
"es-Latn-CO-x-private",
"en-US-boont",
"ab-x-abc-x-abc", // anything goes after x
"ab-x-abc-a-a", // ditto",
"i-default", // grandfathered",
"i-klingon", // grandfathered",
"abcd-Latn", // Language of 4 chars reserved for future use
"AaBbCcDd-x-y-any-x", // Language of 5-8 chars, registered
"en",
"de-AT",
"es-419",
"de-CH-1901",
"sr-Cyrl",
"sr-Cyrl-CS",
"sl-Latn-IT-rozaj",
"en-US-x-twain",
"zh-cmn",
"zh-cmn-Hant",
"zh-cmn-Hant-HK",
"zh-gan",
"zh-yue-Hant-HK",
"xr-lxs-qut", // extlangS
"xr-lqt-qu", // extlang + region
"xr-p-lze", // Extension
];
for tag in tags {
let result = LanguageTag::from_str(tag);
assert!(
result.is_ok(),
"{} should be considered well-formed but returned error {}",
tag,
result.err().unwrap()
);
}
}
#[test]
fn test_broken_tags() {
// http://www.langtag.net/test-suites/broken-tags.txt
let tags = vec![
"f",
"f-Latn",
"fr-Latn-F",
"a-value",
// "en-a-bbb-a-ccc", // 'a' appears twice, commented out because well-formed
"tlh-a-b-foo",
"i-notexist", // grandfathered but not registered: always invalid
"abcdefghi-012345678",
"ab-abc-abc-abc-abc",
"ab-abcd-abc",
"ab-ab-abc",
"ab-123-abc",
"a-Hant-ZH",
"a1-Hant-ZH",
"ab-abcde-abc",
"ab-1abc-abc",
"ab-ab-abcd",
"ab-123-abcd",
"ab-abcde-abcd",
"ab-1abc-abcd",
"ab-a-b",
"ab-a-x",
"ab--ab",
"ab-abc-",
"-ab-abc",
// "ab-c-abc-r-toto-c-abc # 'c' appears twice ", commented out because well-formed
"abcd-efg",
"aabbccddE",
];
for tag in tags {
let result = LanguageTag::from_str(tag);
assert!(
result.is_err(),
"{} should be considered not well-formed but returned result {:?}",
tag,
result.ok().unwrap()
);
}
}
// Tests from RFC 5646 2.1.1
#[test]
fn test_formatting() {
@ -1031,3 +920,221 @@ fn test_is_valid() {
.unwrap()
.is_valid());
}
// http://www.langtag.net/test-suites/well-formed-tags.txt
#[test]
fn test_wellformed_tags() {
let tags = vec![
"fr",
"fr-Latn",
"fr-fra", // Extended tag
"fr-Latn-FR",
"fr-Latn-419",
"fr-FR",
"ax-TZ", // Not in the registry, but well-formed
"fr-shadok", // Variant
"fr-y-myext-myext2",
"fra-Latn", // ISO 639 can be 3-letters
"fra",
"fra-FX",
"i-klingon", // grandfathered with singleton
"I-kLINgon", // tags are case-insensitive...
"no-bok", // grandfathered without singleton
"fr-Lat", // Extended",
"mn-Cyrl-MN",
"mN-cYrL-Mn",
"fr-Latn-CA",
"en-US",
"fr-Latn-CA",
"i-enochian", // Grand fathered
"x-fr-CH",
"sr-Latn-CS",
"es-419",
"sl-nedis",
"de-CH-1996",
"de-Latg-1996",
"sl-IT-nedis",
"en-a-bbb-x-a-ccc",
"de-a-value",
"en-Latn-GB-boont-r-extended-sequence-x-private",
"en-x-US",
"az-Arab-x-AZE-derbend",
"es-Latn-CO-x-private",
"en-US-boont",
"ab-x-abc-x-abc", // anything goes after x
"ab-x-abc-a-a", // ditto",
"i-default", // grandfathered",
"i-klingon", // grandfathered",
"abcd-Latn", // Language of 4 chars reserved for future use
"AaBbCcDd-x-y-any-x", // Language of 5-8 chars, registered
"en",
"de-AT",
"es-419",
"de-CH-1901",
"sr-Cyrl",
"sr-Cyrl-CS",
"sl-Latn-IT-rozaj",
"en-US-x-twain",
"zh-cmn",
"zh-cmn-Hant",
"zh-cmn-Hant-HK",
"zh-gan",
"zh-yue-Hant-HK",
"xr-lxs-qut", // extlangS
"xr-lqt-qu", // extlang + region
"xr-p-lze", // Extension
];
for tag in tags {
let result = LanguageTag::from_str(tag);
assert!(
result.is_ok(),
"{} should be considered well-formed but returned error {}",
tag,
result.err().unwrap()
);
}
}
// http://www.langtag.net/test-suites/broken-tags.txt
#[test]
fn test_broken_tags() {
let tags = vec![
"f",
"f-Latn",
"fr-Latn-F",
"a-value",
// "en-a-bbb-a-ccc", # 'a' appears twice, commented out because well-formed
"tlh-a-b-foo",
"i-notexist", // grandfathered but not registered: always invalid
"abcdefghi-012345678",
"ab-abc-abc-abc-abc",
"ab-abcd-abc",
"ab-ab-abc",
"ab-123-abc",
"a-Hant-ZH",
"a1-Hant-ZH",
"ab-abcde-abc",
"ab-1abc-abc",
"ab-ab-abcd",
"ab-123-abcd",
"ab-abcde-abcd",
"ab-1abc-abcd",
"ab-a-b",
"ab-a-x",
"ab--ab",
"ab-abc-",
"-ab-abc",
// "ab-c-abc-r-toto-c-abc # 'c' appears twice ", commented out because well-formed
"abcd-efg",
"aabbccddE",
];
for tag in tags {
let result = LanguageTag::from_str(tag);
assert!(
result.is_err(),
"{} should be considered not well-formed but returned result {:?}",
tag,
result.ok().unwrap()
);
}
}
// http://www.langtag.net/test-suites/valid-tags.txt
#[test]
fn test_valid_tags() {
let tags = vec![
"fr",
"fr-Latn",
"fr-fra", // Extended tag
"fr-Latn-FR",
"fr-Latn-419",
"fr-FR",
"fr-y-myext-myext2",
"apa-Latn", // ISO 639 can be 3-letters
"apa",
"apa-CA",
"i-klingon", // grandfathered with singleton
"no-bok", // grandfathered without singleton
"fr-Lat", // Extended
"mn-Cyrl-MN",
"mN-cYrL-Mn",
"fr-Latn-CA",
"en-US",
"fr-Latn-CA",
"i-enochian", // Grand fathered
"x-fr-CH",
"sr-Latn-CS",
"es-419",
"sl-nedis",
"de-CH-1996",
"de-Latg-1996",
"sl-IT-nedis",
"en-a-bbb-x-a-ccc",
"de-a-value",
"en-x-US",
"az-Arab-x-AZE-derbend",
"es-Latn-CO-x-private",
"ab-x-abc-x-abc", // anything goes after x
"ab-x-abc-a-a", // ditto
"i-default", // grandfathered
"i-klingon", // grandfathered
"en",
"de-AT",
"es-419",
"de-CH-1901",
"sr-Cyrl",
"sr-Cyrl-CS",
"sl-Latn-IT-rozaj",
"en-US-x-twain",
"zh-cmn",
"zh-cmn-Hant",
"zh-cmn-Hant-HK",
"zh-gan",
"zh-yue-Hant-HK",
"en-Latn-GB-boont-r-extended-sequence-x-private",
"en-US-boont",
];
for tag in tags {
let result = LanguageTag::from_str(tag);
assert!(
result.is_ok(),
"{} should be considered well-formed but returned error {}",
tag,
result.err().unwrap()
);
let validation = result.unwrap().validate();
assert!(
validation.is_ok(),
"{} should be considered valid but returned error {}",
tag,
validation.err().unwrap()
);
}
}
// http://www.langtag.net/test-suites/invalid-tags.txt
#[test]
fn test_invalid_tags() {
let tags = vec![
//TODO "ax-TZ", // Not in the registry, but well-formed
//TODO "fra-Latn", // ISO 639 can be 3-letters
//TODO "fra",
//TODO "fra-FX",
//TODO "abcd-Latn", // Language of 4 chars reserved for future use
//TODO "AaBbCcDd-x-y-any-x", // Language of 5-8 chars, registered
//TODO "zh-Latm-CN", // Typo
//TODO "de-DE-1902", // Wrong variant
//TODO "fr-shadok", // Variant
];
for tag in tags {
let result = LanguageTag::from_str(tag);
assert!(
result.is_ok(),
"{} should be considered well-formed but returned error {}",
tag,
result.err().unwrap()
);
let validation = result.unwrap().validate();
assert!(validation.is_err(), "{} should be considered invalid", tag);
}
}

Loading…
Cancel
Save