From 295bb16ab7df28983d03e83f7c8637dde66282d7 Mon Sep 17 00:00:00 2001 From: thibaud-leclere Date: Wed, 1 Apr 2026 21:12:28 +0200 Subject: [PATCH] fix: reduce false positives in award detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filter SPARQL query to only return entertainment awards (film, TV, music, theater) and add a canonical award map to normalize variants (e.g. all Oscar/Academy Award → "Oscar", all Golden Globe → "Golden Globe"). Non-entertainment awards (orders, medals, honorary degrees) are excluded both at SPARQL level and via PHP keyword filter. Also restart messenger container on cache:clear to avoid stale DI container errors. Co-Authored-By: Claude Opus 4.6 (1M context) --- Makefile | 1 + src/Gateway/WikidataGateway.php | 12 +++ src/Import/AwardImporter.php | 132 ++++++++++++++++++++++++++++- tests/Import/AwardImporterTest.php | 117 +++++++++++++++++++++++-- 4 files changed, 255 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 0fd9f74..4224099 100644 --- a/Makefile +++ b/Makefile @@ -60,6 +60,7 @@ php\:console: ## Lance bin/console avec arguments (ex: make php:console -- cache symfony\:cache-clear: ## Vide le cache Symfony docker compose exec app php bin/console cache:clear + docker compose restart messenger test: ## Lance les tests PHPUnit docker compose exec app php bin/phpunit diff --git a/src/Gateway/WikidataGateway.php b/src/Gateway/WikidataGateway.php index 7e69e1f..83f97b5 100644 --- a/src/Gateway/WikidataGateway.php +++ b/src/Gateway/WikidataGateway.php @@ -91,6 +91,18 @@ class WikidataGateway ?awardStatement ps:P166 ?award . ?awardStatement pq:P585 ?date . 
BIND(YEAR(?date) AS ?year) + + # Only keep entertainment awards (film, TV, music, theater, performing arts) + VALUES ?awardSuperclass { + wd:Q4220920 # film award + wd:Q1407443 # television award + wd:Q2235858 # music award + wd:Q15056993 # film festival award + wd:Q15383322 # theater award + wd:Q29461289 # performing arts award + } + ?award wdt:P31/wdt:P279* ?awardSuperclass . + SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en" . } } ORDER BY ?name DESC(?year) diff --git a/src/Import/AwardImporter.php b/src/Import/AwardImporter.php index e3553b2..c8d26d2 100644 --- a/src/Import/AwardImporter.php +++ b/src/Import/AwardImporter.php @@ -14,6 +14,76 @@ use Psr\Log\LoggerInterface; readonly class AwardImporter { + /** + * Canonical award name => keywords to match (case-insensitive). + * Checked in order — first match wins. + */ + private const AWARD_MAP = [ + 'Oscar' => ['Academy Award', 'Oscar'], + 'Golden Globe' => ['Golden Globe', 'Golden Globes'], + 'BAFTA' => ['BAFTA', 'British Academy Film Award', 'British Academy Television Award', 'British Academy Games Award'], + 'César' => ['César'], + 'SAG' => ['Screen Actors Guild'], + 'Emmy' => ['Emmy Award', 'Primetime Emmy'], + 'Tony' => ['Tony Award', 'Tony award'], + 'Grammy' => ['Grammy'], + 'Cannes' => ['Festival de Cannes', 'Cannes', "Palme d'or", "Caméra d'or"], + 'Sundance' => ['Sundance'], + 'Berlinale' => ['Berlinale', 'Berliner Bär', "Ours d'argent", "Ours d'or"], + 'Mostra de Venise' => ['Mostra', 'Venice Film Festival', 'Coupe Volpi', "Lion d'or"], + 'Saturn' => ['Saturn Award'], + 'MTV' => ['MTV Movie', 'MTV Video'], + "Critics' Choice" => ["Critics' Choice"], + 'Independent Spirit' => ['Independent Spirit'], + 'Annie' => ['Annie Award'], + 'Goya' => ['prix Goya', 'Goya Award'], + 'Laurence Olivier' => ['Laurence Olivier', 'Olivier Award'], + 'David di Donatello' => ['David di Donatello'], + 'Gotham' => ['Gotham Award', 'Gotham Independent'], + 'NAACP Image' => ['NAACP Image'], + "People's 
Choice" => ["People's Choice"], + 'Teen Choice' => ['Teen Choice'], + 'BET' => ['BET Award', 'BET Her', 'BET YoungStars'], + 'Black Reel' => ['Black Reel'], + 'National Board of Review' => ['National Board of Review'], + 'New York Film Critics Circle' => ['New York Film Critics Circle'], + 'Los Angeles Film Critics' => ['Los Angeles Film Critics'], + 'San Sebastián' => ['Donostia', 'San Sebastián'], + 'Sitges' => ['Sitges'], + 'Satellite' => ['Satellite Award'], + 'Lucille Lortel' => ['Lucille Lortel'], + 'Golden Raspberry' => ['Golden Raspberry', 'Razzie'], + 'Drama Desk' => ['Drama Desk'], + 'Genie' => ['Genie Award'], + 'European Film Award' => ['prix du cinéma européen', 'European Film Award'], + 'AACTA' => ['AACTA'], + ]; + + /** + * Keywords indicating non-entertainment awards (case-insensitive). + * These slip through even with the SPARQL filter. + */ + private const EXCLUDED_KEYWORDS = [ + // National orders and decorations + 'chevalier', 'officier', 'commandeur', 'compagnon', + 'ordre du', 'ordre de', 'order of the', + 'grand-croix', 'grand officier', 'grand cordon', + 'Knight Bachelor', 'Knight Commander', + 'croix d\'', + // Honorary degrees and memberships + 'honoris causa', + 'membre de l\'', 'membre de la', 'membre honoraire', 'membre associé', 'membre élu', + 'Fellow of', 'fellow de', + // Scholarships + 'bourse ', + // Medals (military, scientific, etc.) + 'médaille', 'Medal', + // Other non-entertainment + 'Time 100', '100 Women', 'All-NBA', + 'étoile du Hollywood Walk of Fame', + 'allée des célébrités', + ]; + public function __construct( private WikidataGateway $wikidataGateway, private AwardTypeRepository $awardTypeRepository, @@ -54,6 +124,10 @@ readonly class AwardImporter $wikidataAwards = $allAwards[$actor->getName()] ?? 
 []; foreach ($wikidataAwards as $wikidataAward) { + if ($this->isExcluded($wikidataAward['name'])) { + continue; + } + $awardType = $this->resolveAwardType($wikidataAward['name'], $knownTypes); $award = new Award(); @@ -74,12 +148,33 @@ readonly class AwardImporter */ private function resolveAwardType(string $awardName, array &$knownTypes): AwardType { + // 1. Try canonical map first + $canonicalName = $this->findCanonicalName($awardName); + + if (null !== $canonicalName) { + foreach ($knownTypes as $type) { + if ($type->getName() === $canonicalName) { + return $type; + } + } + + $newType = new AwardType(); + $newType->setName($canonicalName); + $newType->setPattern($canonicalName); + $this->em->persist($newType); + $knownTypes[] = $newType; + + return $newType; + } + + // 2. Fall back to existing pattern matching foreach ($knownTypes as $type) { if (str_contains($awardName, $type->getPattern())) { return $type; } } + // 3. Create new type with prefix extraction $newType = new AwardType(); $prefix = $this->extractPrefix($awardName); $newType->setName($prefix); @@ -91,10 +186,43 @@ readonly class AwardImporter return $newType; } + private function findCanonicalName(string $awardName): ?string + { + $normalized = mb_strtolower($awardName); + + foreach (self::AWARD_MAP as $canonical => $keywords) { + foreach ($keywords as $keyword) { + if (str_contains($normalized, mb_strtolower($keyword))) { + return $canonical; + } + } + } + + return null; + } + + private function isExcluded(string $awardName): bool + { + $normalized = mb_strtolower($awardName); + + foreach (self::EXCLUDED_KEYWORDS as $keyword) { + if (str_contains($normalized, mb_strtolower($keyword))) { + return true; + } + } + + return false; + } + private function extractPrefix(string $awardName): string { - // Extract text before " for " or " pour " (common patterns in award names) - if (preg_match('/^(.+?)\s+(?:for|pour)\s+/i', $awardName, $matches)) { + // "X for Y", "X pour Y", "X du Y", "X de la Y", "X de 
l'Y", "X des Y" (the elided "de l'" is followed directly by the noun, so that alternative must not require trailing whitespace; both straight and typographic apostrophes are accepted) + if (preg_match('/^(.+?)\s+(?:(?:for|pour|du|de la|des)\s+|de l[\'’])/iu', $awardName, $matches)) { + return trim($matches[1]); + } + + // "... festival de cinéma de X" or "... festival de X" + if (preg_match('/festival\s+(?:de\s+(?:cinéma\s+de\s+)?)?(.+?)$/iu', $awardName, $matches)) { return trim($matches[1]); } diff --git a/tests/Import/AwardImporterTest.php b/tests/Import/AwardImporterTest.php index f4ae9ea..63d014e 100644 --- a/tests/Import/AwardImporterTest.php +++ b/tests/Import/AwardImporterTest.php @@ -53,7 +53,7 @@ class AwardImporterTest extends TestCase ]); $existingType = new AwardType(); - $existingType->setName('Oscar')->setPattern('Academy Award'); + $existingType->setName('Oscar')->setPattern('Oscar'); $this->awardTypeRepository->method('findAll')->willReturn([$existingType]); @@ -73,7 +73,7 @@ class AwardImporterTest extends TestCase $this->assertSame($actor, $persisted[0]->getActor()); } - public function testCreatesNewAwardTypeWhenNoPatternMatches(): void + public function testCanonicalMapGroupsRelatedAwards(): void { $actor = $this->createActorWithFlag('Test Actor', awardsImported: false); @@ -98,14 +98,67 @@ class AwardImporterTest extends TestCase $newType = $persisted[0]; $this->assertInstanceOf(AwardType::class, $newType); - $this->assertSame('Screen Actors Guild Award', $newType->getName()); - $this->assertSame('Screen Actors Guild Award', $newType->getPattern()); + $this->assertSame('SAG', $newType->getName()); + $this->assertSame('SAG', $newType->getPattern()); $award = $persisted[1]; $this->assertInstanceOf(Award::class, $award); $this->assertSame($newType, $award->getAwardType()); } + public function testFallsBackToExtractPrefixWhenNotInCanonicalMap(): void + { + $actor = $this->createActorWithFlag('Test Actor', awardsImported: false); + + 
$this->wikidataGateway->method('getAwardsForActors')->willReturn([ + 'Test Actor' => [ + ['name' => 'Bambi for Best Film', 'year' => 2019], + ], + ]); + + 
$this->awardTypeRepository->method('findAll')->willReturn([]); + + $persisted = []; + $this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) { + $persisted[] = $entity; + }); + + $this->importer->importForActors([$actor]); + + $newType = $persisted[0]; + $this->assertInstanceOf(AwardType::class, $newType); + $this->assertSame('Bambi', $newType->getName()); + } + + public function testExcludesNonEntertainmentAwards(): void + { + $actor = $this->createActorWithFlag('Test Actor', awardsImported: false); + + $this->wikidataGateway->method('getAwardsForActors')->willReturn([ + 'Test Actor' => [ + ['name' => 'chevalier de la Légion d\'honneur', 'year' => 2015], + ['name' => 'docteur honoris causa', 'year' => 2018], + ['name' => 'bourse Rhodes', 'year' => 2010], + ['name' => 'Oscar du meilleur acteur', 'year' => 2020], + ], + ]); + + $this->awardTypeRepository->method('findAll')->willReturn([]); + + $persisted = []; + $this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) { + $persisted[] = $entity; + }); + + $this->importer->importForActors([$actor]); + + // Only the Oscar should be persisted (1 AwardType + 1 Award) + $this->assertCount(2, $persisted); + $this->assertInstanceOf(AwardType::class, $persisted[0]); + $this->assertSame('Oscar', $persisted[0]->getName()); + $this->assertInstanceOf(Award::class, $persisted[1]); + } + public function testDoesNotSetFlagOnWikidataError(): void { $actor = $this->createActorWithFlag('Test Actor', awardsImported: false); @@ -146,7 +199,7 @@ class AwardImporterTest extends TestCase ]); $existingType = new AwardType(); - $existingType->setName('Oscar')->setPattern('Academy Award'); + $existingType->setName('Oscar')->setPattern('Oscar'); $this->awardTypeRepository->method('findAll')->willReturn([$existingType]); @@ -163,6 +216,60 @@ class AwardImporterTest extends TestCase $this->assertCount(3, $persisted); } + public function testExtractPrefixHandlesFrenchPatterns(): void + 
{ + $actor = $this->createActorWithFlag('Test Actor', awardsImported: false); + + $this->wikidataGateway->method('getAwardsForActors')->willReturn([ + 'Test Actor' => [ + ['name' => 'Bodil du meilleur acteur', 'year' => 2019], + ], + ]); + + $this->awardTypeRepository->method('findAll')->willReturn([]); + + $persisted = []; + $this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) { + $persisted[] = $entity; + }); + + $this->importer->importForActors([$actor]); + + $newType = $persisted[0]; + $this->assertInstanceOf(AwardType::class, $newType); + $this->assertSame('Bodil', $newType->getName()); + } + + public function testCanonicalMapReusesExistingType(): void + { + $actor = $this->createActorWithFlag('Test Actor', awardsImported: false); + + $this->wikidataGateway->method('getAwardsForActors')->willReturn([ + 'Test Actor' => [ + ['name' => 'oscar du meilleur acteur', 'year' => 2020], + ['name' => 'Oscar de la meilleure actrice', 'year' => 2021], + ], + ]); + + $existingOscar = new AwardType(); + $existingOscar->setName('Oscar')->setPattern('Oscar'); + + $this->awardTypeRepository->method('findAll')->willReturn([$existingOscar]); + + $persisted = []; + $this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) { + $persisted[] = $entity; + }); + + $this->importer->importForActors([$actor]); + + // Both awards should reuse the same "Oscar" type — only 2 Awards persisted, no new AwardType + $this->assertCount(2, $persisted); + $this->assertContainsOnlyInstancesOf(Award::class, $persisted); + $this->assertSame($existingOscar, $persisted[0]->getAwardType()); + $this->assertSame($existingOscar, $persisted[1]->getAwardType()); + } + private function createActorWithFlag(string $name, bool $awardsImported): Actor { $actor = new Actor();