fix: reduce false positives in award detection

Filter SPARQL query to only return entertainment awards (film, TV,
music, theater) and add a canonical award map to normalize variants
(e.g. all Oscar/Academy Award → "Oscar", all Golden Globe → "Golden
Globe"). Non-entertainment awards (orders, medals, honorary degrees)
are excluded both at SPARQL level and via PHP keyword filter.

Also restart messenger container on cache:clear to avoid stale DI
container errors.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
thibaud-leclere
2026-04-01 21:12:28 +02:00
parent 116d7b409e
commit 295bb16ab7
4 changed files with 255 additions and 7 deletions

View File

@@ -91,6 +91,18 @@ class WikidataGateway
?awardStatement ps:P166 ?award .
?awardStatement pq:P585 ?date .
BIND(YEAR(?date) AS ?year)
# Only keep entertainment awards (film, TV, music, theater, performing arts)
VALUES ?awardSuperclass {
wd:Q4220920 # film award
wd:Q1407443 # television award
wd:Q2235858 # music award
wd:Q15056993 # film festival award
wd:Q15383322 # theater award
wd:Q29461289 # performing arts award
}
?award wdt:P31/wdt:P279* ?awardSuperclass .
SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en" . }
}
ORDER BY ?name DESC(?year)

View File

@@ -14,6 +14,76 @@ use Psr\Log\LoggerInterface;
readonly class AwardImporter
{
/**
 * Canonical award name => keywords identifying that award.
 *
 * Keywords are matched as case-insensitive substrings of the award label,
 * and entries are checked in declaration order — the first match wins.
 *
 * Because matching is substring-based and case-insensitive, a keyword that
 * contains another keyword of the same entry (e.g. a plural form), or that
 * differs from it only by letter case, can never be the deciding match.
 * Such variants are intentionally not listed.
 *
 * @var array<string, list<string>>
 */
private const AWARD_MAP = [
    'Oscar' => ['Academy Award', 'Oscar'],
    'Golden Globe' => ['Golden Globe'],
    'BAFTA' => ['BAFTA', 'British Academy Film Award', 'British Academy Television Award', 'British Academy Games Award'],
    'César' => ['César'],
    'SAG' => ['Screen Actors Guild'],
    'Emmy' => ['Emmy Award', 'Primetime Emmy'],
    'Tony' => ['Tony Award'],
    'Grammy' => ['Grammy'],
    'Cannes' => ['Festival de Cannes', 'Cannes', "Palme d'or", "Caméra d'or"],
    'Sundance' => ['Sundance'],
    'Berlinale' => ['Berlinale', 'Berliner Bär', "Ours d'argent", "Ours d'or"],
    'Mostra de Venise' => ['Mostra', 'Venice Film Festival', 'Coupe Volpi', "Lion d'or"],
    'Saturn' => ['Saturn Award'],
    'MTV' => ['MTV Movie', 'MTV Video'],
    "Critics' Choice" => ["Critics' Choice"],
    'Independent Spirit' => ['Independent Spirit'],
    'Annie' => ['Annie Award'],
    'Goya' => ['prix Goya', 'Goya Award'],
    'Laurence Olivier' => ['Laurence Olivier', 'Olivier Award'],
    'David di Donatello' => ['David di Donatello'],
    'Gotham' => ['Gotham Award', 'Gotham Independent'],
    'NAACP Image' => ['NAACP Image'],
    "People's Choice" => ["People's Choice"],
    'Teen Choice' => ['Teen Choice'],
    'BET' => ['BET Award', 'BET Her', 'BET YoungStars'],
    'Black Reel' => ['Black Reel'],
    'National Board of Review' => ['National Board of Review'],
    'New York Film Critics Circle' => ['New York Film Critics Circle'],
    'Los Angeles Film Critics' => ['Los Angeles Film Critics'],
    'San Sebastián' => ['Donostia', 'San Sebastián'],
    'Sitges' => ['Sitges'],
    'Satellite' => ['Satellite Award'],
    'Lucille Lortel' => ['Lucille Lortel'],
    'Golden Raspberry' => ['Golden Raspberry', 'Razzie'],
    'Drama Desk' => ['Drama Desk'],
    'Genie' => ['Genie Award'],
    'European Film Award' => ['prix du cinéma européen', 'European Film Award'],
    'AACTA' => ['AACTA'],
];
/**
 * Keywords indicating non-entertainment awards.
 *
 * An award is skipped when its label contains any of these keywords as a
 * case-insensitive substring. This is a second line of defence: such awards
 * slip through even with the SPARQL filter.
 *
 * NOTE(review): because matching is substring-based, some entries are already
 * covered by shorter ones (e.g. 'grand officier' by 'officier'); they are
 * harmless but redundant.
 * NOTE(review): the trailing space in 'bourse ' presumably avoids matching
 * longer words that merely start with "bourse" — confirm before changing it.
 *
 * @var list<string>
 */
private const EXCLUDED_KEYWORDS = [
// National orders and decorations
'chevalier', 'officier', 'commandeur', 'compagnon',
'ordre du', 'ordre de', 'order of the',
'grand-croix', 'grand officier', 'grand cordon',
'Knight Bachelor', 'Knight Commander',
'croix d\'',
// Honorary degrees and memberships
'honoris causa',
'membre de l\'', 'membre de la', 'membre honoraire', 'membre associé', 'membre élu',
'Fellow of', 'fellow de',
// Scholarships
'bourse ',
// Medals (military, scientific, etc.)
'médaille', 'Medal',
// Other non-entertainment
'Time 100', '100 Women', 'All-NBA',
'étoile du Hollywood Walk of Fame',
'allée des célébrités',
];
public function __construct(
private WikidataGateway $wikidataGateway,
private AwardTypeRepository $awardTypeRepository,
@@ -54,6 +124,10 @@ readonly class AwardImporter
$wikidataAwards = $allAwards[$actor->getName()] ?? [];
foreach ($wikidataAwards as $wikidataAward) {
if ($this->isExcluded($wikidataAward['name'])) {
continue;
}
$awardType = $this->resolveAwardType($wikidataAward['name'], $knownTypes);
$award = new Award();
@@ -74,12 +148,33 @@ readonly class AwardImporter
*/
private function resolveAwardType(string $awardName, array &$knownTypes): AwardType
{
// 1. Try canonical map first
$canonicalName = $this->findCanonicalName($awardName);
if (null !== $canonicalName) {
foreach ($knownTypes as $type) {
if ($type->getName() === $canonicalName) {
return $type;
}
}
$newType = new AwardType();
$newType->setName($canonicalName);
$newType->setPattern($canonicalName);
$this->em->persist($newType);
$knownTypes[] = $newType;
return $newType;
}
// 2. Fall back to existing pattern matching
foreach ($knownTypes as $type) {
if (str_contains($awardName, $type->getPattern())) {
return $type;
}
}
// 3. Create new type with prefix extraction
$newType = new AwardType();
$prefix = $this->extractPrefix($awardName);
$newType->setName($prefix);
@@ -91,10 +186,43 @@ readonly class AwardImporter
return $newType;
}
/**
 * Maps a raw award label to its canonical award name, if one is known.
 *
 * Walks the canonical award map in declaration order and returns the key of
 * the first entry having a keyword contained in the label. The comparison is
 * done on lowercased (multibyte-aware) strings, so it is case-insensitive.
 *
 * @param string $awardName Raw award label to classify.
 *
 * @return string|null The canonical name, or null when no keyword matches.
 */
private function findCanonicalName(string $awardName): ?string
{
    $haystack = mb_strtolower($awardName);

    foreach (self::AWARD_MAP as $canonical => $keywords) {
        foreach ($keywords as $keyword) {
            if (!str_contains($haystack, mb_strtolower($keyword))) {
                continue;
            }

            // First hit decides: later entries are never consulted.
            return $canonical;
        }
    }

    return null;
}
/**
 * Tells whether an award label belongs to a non-entertainment award.
 *
 * The label is rejected as soon as it contains one of the excluded keywords;
 * both sides are lowercased (multibyte-aware) before comparison, so the check
 * is case-insensitive.
 *
 * @param string $awardName Raw award label to check.
 *
 * @return bool True when the award must be skipped, false otherwise.
 */
private function isExcluded(string $awardName): bool
{
    $haystack = mb_strtolower($awardName);
    $needles = array_map(mb_strtolower(...), self::EXCLUDED_KEYWORDS);

    foreach ($needles as $needle) {
        if (str_contains($haystack, $needle)) {
            return true;
        }
    }

    return false;
}
private function extractPrefix(string $awardName): string
{
// Extract text before " for " or " pour " (common patterns in award names)
if (preg_match('/^(.+?)\s+(?:for|pour)\s+/i', $awardName, $matches)) {
// "X for Y", "X pour Y", "X du Y", "X de la Y", "X de l'Y", "X des Y"
if (preg_match('/^(.+?)\s+(?:for|pour|du|de la|de l\'|des)\s+/iu', $awardName, $matches)) {
return trim($matches[1]);
}
// "... festival de cinéma de X" or "... festival de X"
if (preg_match('/festival\s+(?:de\s+(?:cinéma\s+de\s+)?)?(.+?)$/iu', $awardName, $matches)) {
return trim($matches[1]);
}