fix: reduce false positives in award detection
Filter the SPARQL query to return only entertainment awards (film, TV, music, theater) and add a canonical award map that normalizes name variants (e.g. every Oscar/Academy Award variant → "Oscar", every Golden Globe variant → "Golden Globe"). Non-entertainment awards (orders, medals, honorary degrees) are excluded both at the SPARQL level and by a PHP keyword filter.

Also restart the messenger container on cache:clear to avoid stale DI container errors.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
1
Makefile
1
Makefile
@@ -60,6 +60,7 @@ php\:console: ## Lance bin/console avec arguments (ex: make php:console -- cache
|
|||||||
|
|
||||||
symfony\:cache-clear: ## Vide le cache Symfony
|
symfony\:cache-clear: ## Vide le cache Symfony
|
||||||
docker compose exec app php bin/console cache:clear
|
docker compose exec app php bin/console cache:clear
|
||||||
|
docker compose restart messenger
|
||||||
|
|
||||||
test: ## Lance les tests PHPUnit
|
test: ## Lance les tests PHPUnit
|
||||||
docker compose exec app php bin/phpunit
|
docker compose exec app php bin/phpunit
|
||||||
|
|||||||
@@ -91,6 +91,18 @@ class WikidataGateway
|
|||||||
?awardStatement ps:P166 ?award .
|
?awardStatement ps:P166 ?award .
|
||||||
?awardStatement pq:P585 ?date .
|
?awardStatement pq:P585 ?date .
|
||||||
BIND(YEAR(?date) AS ?year)
|
BIND(YEAR(?date) AS ?year)
|
||||||
|
|
||||||
|
# Only keep entertainment awards (film, TV, music, theater, performing arts)
|
||||||
|
VALUES ?awardSuperclass {
|
||||||
|
wd:Q4220920 # film award
|
||||||
|
wd:Q1407443 # television award
|
||||||
|
wd:Q2235858 # music award
|
||||||
|
wd:Q15056993 # film festival award
|
||||||
|
wd:Q15383322 # theater award
|
||||||
|
wd:Q29461289 # performing arts award
|
||||||
|
}
|
||||||
|
?award wdt:P31/wdt:P279* ?awardSuperclass .
|
||||||
|
|
||||||
SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en" . }
|
SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en" . }
|
||||||
}
|
}
|
||||||
ORDER BY ?name DESC(?year)
|
ORDER BY ?name DESC(?year)
|
||||||
|
|||||||
@@ -14,6 +14,76 @@ use Psr\Log\LoggerInterface;
|
|||||||
|
|
||||||
readonly class AwardImporter
|
readonly class AwardImporter
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* Canonical award name => keywords to match (case-insensitive).
|
||||||
|
* Checked in order — first match wins.
|
||||||
|
*/
|
||||||
|
private const AWARD_MAP = [
|
||||||
|
'Oscar' => ['Academy Award', 'Oscar'],
|
||||||
|
'Golden Globe' => ['Golden Globe', 'Golden Globes'],
|
||||||
|
'BAFTA' => ['BAFTA', 'British Academy Film Award', 'British Academy Television Award', 'British Academy Games Award'],
|
||||||
|
'César' => ['César'],
|
||||||
|
'SAG' => ['Screen Actors Guild'],
|
||||||
|
'Emmy' => ['Emmy Award', 'Primetime Emmy'],
|
||||||
|
'Tony' => ['Tony Award', 'Tony award'],
|
||||||
|
'Grammy' => ['Grammy'],
|
||||||
|
'Cannes' => ['Festival de Cannes', 'Cannes', "Palme d'or", "Caméra d'or"],
|
||||||
|
'Sundance' => ['Sundance'],
|
||||||
|
'Berlinale' => ['Berlinale', 'Berliner Bär', "Ours d'argent", "Ours d'or"],
|
||||||
|
'Mostra de Venise' => ['Mostra', 'Venice Film Festival', 'Coupe Volpi', "Lion d'or"],
|
||||||
|
'Saturn' => ['Saturn Award'],
|
||||||
|
'MTV' => ['MTV Movie', 'MTV Video'],
|
||||||
|
"Critics' Choice" => ["Critics' Choice"],
|
||||||
|
'Independent Spirit' => ['Independent Spirit'],
|
||||||
|
'Annie' => ['Annie Award'],
|
||||||
|
'Goya' => ['prix Goya', 'Goya Award'],
|
||||||
|
'Laurence Olivier' => ['Laurence Olivier', 'Olivier Award'],
|
||||||
|
'David di Donatello' => ['David di Donatello'],
|
||||||
|
'Gotham' => ['Gotham Award', 'Gotham Independent'],
|
||||||
|
'NAACP Image' => ['NAACP Image'],
|
||||||
|
"People's Choice" => ["People's Choice"],
|
||||||
|
'Teen Choice' => ['Teen Choice'],
|
||||||
|
'BET' => ['BET Award', 'BET Her', 'BET YoungStars'],
|
||||||
|
'Black Reel' => ['Black Reel'],
|
||||||
|
'National Board of Review' => ['National Board of Review'],
|
||||||
|
'New York Film Critics Circle' => ['New York Film Critics Circle'],
|
||||||
|
'Los Angeles Film Critics' => ['Los Angeles Film Critics'],
|
||||||
|
'San Sebastián' => ['Donostia', 'San Sebastián'],
|
||||||
|
'Sitges' => ['Sitges'],
|
||||||
|
'Satellite' => ['Satellite Award'],
|
||||||
|
'Lucille Lortel' => ['Lucille Lortel'],
|
||||||
|
'Golden Raspberry' => ['Golden Raspberry', 'Razzie'],
|
||||||
|
'Drama Desk' => ['Drama Desk'],
|
||||||
|
'Genie' => ['Genie Award'],
|
||||||
|
'European Film Award' => ['prix du cinéma européen', 'European Film Award'],
|
||||||
|
'AACTA' => ['AACTA'],
|
||||||
|
];
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Keywords indicating non-entertainment awards (case-insensitive).
|
||||||
|
* These slip through even with the SPARQL filter.
|
||||||
|
*/
|
||||||
|
private const EXCLUDED_KEYWORDS = [
|
||||||
|
// National orders and decorations
|
||||||
|
'chevalier', 'officier', 'commandeur', 'compagnon',
|
||||||
|
'ordre du', 'ordre de', 'order of the',
|
||||||
|
'grand-croix', 'grand officier', 'grand cordon',
|
||||||
|
'Knight Bachelor', 'Knight Commander',
|
||||||
|
'croix d\'',
|
||||||
|
// Honorary degrees and memberships
|
||||||
|
'honoris causa',
|
||||||
|
'membre de l\'', 'membre de la', 'membre honoraire', 'membre associé', 'membre élu',
|
||||||
|
'Fellow of', 'fellow de',
|
||||||
|
// Scholarships
|
||||||
|
'bourse ',
|
||||||
|
// Medals (military, scientific, etc.)
|
||||||
|
'médaille', 'Medal',
|
||||||
|
// Other non-entertainment
|
||||||
|
'Time 100', '100 Women', 'All-NBA',
|
||||||
|
'étoile du Hollywood Walk of Fame',
|
||||||
|
'allée des célébrités',
|
||||||
|
];
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private WikidataGateway $wikidataGateway,
|
private WikidataGateway $wikidataGateway,
|
||||||
private AwardTypeRepository $awardTypeRepository,
|
private AwardTypeRepository $awardTypeRepository,
|
||||||
@@ -54,6 +124,10 @@ readonly class AwardImporter
|
|||||||
$wikidataAwards = $allAwards[$actor->getName()] ?? [];
|
$wikidataAwards = $allAwards[$actor->getName()] ?? [];
|
||||||
|
|
||||||
foreach ($wikidataAwards as $wikidataAward) {
|
foreach ($wikidataAwards as $wikidataAward) {
|
||||||
|
if ($this->isExcluded($wikidataAward['name'])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
$awardType = $this->resolveAwardType($wikidataAward['name'], $knownTypes);
|
$awardType = $this->resolveAwardType($wikidataAward['name'], $knownTypes);
|
||||||
|
|
||||||
$award = new Award();
|
$award = new Award();
|
||||||
@@ -74,12 +148,33 @@ readonly class AwardImporter
|
|||||||
*/
|
*/
|
||||||
private function resolveAwardType(string $awardName, array &$knownTypes): AwardType
|
private function resolveAwardType(string $awardName, array &$knownTypes): AwardType
|
||||||
{
|
{
|
||||||
|
// 1. Try canonical map first
|
||||||
|
$canonicalName = $this->findCanonicalName($awardName);
|
||||||
|
|
||||||
|
if (null !== $canonicalName) {
|
||||||
|
foreach ($knownTypes as $type) {
|
||||||
|
if ($type->getName() === $canonicalName) {
|
||||||
|
return $type;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$newType = new AwardType();
|
||||||
|
$newType->setName($canonicalName);
|
||||||
|
$newType->setPattern($canonicalName);
|
||||||
|
$this->em->persist($newType);
|
||||||
|
$knownTypes[] = $newType;
|
||||||
|
|
||||||
|
return $newType;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Fall back to existing pattern matching
|
||||||
foreach ($knownTypes as $type) {
|
foreach ($knownTypes as $type) {
|
||||||
if (str_contains($awardName, $type->getPattern())) {
|
if (str_contains($awardName, $type->getPattern())) {
|
||||||
return $type;
|
return $type;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 3. Create new type with prefix extraction
|
||||||
$newType = new AwardType();
|
$newType = new AwardType();
|
||||||
$prefix = $this->extractPrefix($awardName);
|
$prefix = $this->extractPrefix($awardName);
|
||||||
$newType->setName($prefix);
|
$newType->setName($prefix);
|
||||||
@@ -91,10 +186,43 @@ readonly class AwardImporter
|
|||||||
return $newType;
|
return $newType;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private function findCanonicalName(string $awardName): ?string
|
||||||
|
{
|
||||||
|
$normalized = mb_strtolower($awardName);
|
||||||
|
|
||||||
|
foreach (self::AWARD_MAP as $canonical => $keywords) {
|
||||||
|
foreach ($keywords as $keyword) {
|
||||||
|
if (str_contains($normalized, mb_strtolower($keyword))) {
|
||||||
|
return $canonical;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private function isExcluded(string $awardName): bool
|
||||||
|
{
|
||||||
|
$normalized = mb_strtolower($awardName);
|
||||||
|
|
||||||
|
foreach (self::EXCLUDED_KEYWORDS as $keyword) {
|
||||||
|
if (str_contains($normalized, mb_strtolower($keyword))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
private function extractPrefix(string $awardName): string
|
private function extractPrefix(string $awardName): string
|
||||||
{
|
{
|
||||||
// Extract text before " for " or " pour " (common patterns in award names)
|
// "X for Y", "X pour Y", "X du Y", "X de la Y", "X de l'Y", "X des Y"
|
||||||
if (preg_match('/^(.+?)\s+(?:for|pour)\s+/i', $awardName, $matches)) {
|
if (preg_match('/^(.+?)\s+(?:for|pour|du|de la|de l\'|des)\s+/iu', $awardName, $matches)) {
|
||||||
|
return trim($matches[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// "... festival de cinéma de X" or "... festival de X"
|
||||||
|
if (preg_match('/festival\s+(?:de\s+(?:cinéma\s+de\s+)?)?(.+?)$/iu', $awardName, $matches)) {
|
||||||
return trim($matches[1]);
|
return trim($matches[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class AwardImporterTest extends TestCase
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
$existingType = new AwardType();
|
$existingType = new AwardType();
|
||||||
$existingType->setName('Oscar')->setPattern('Academy Award');
|
$existingType->setName('Oscar')->setPattern('Oscar');
|
||||||
|
|
||||||
$this->awardTypeRepository->method('findAll')->willReturn([$existingType]);
|
$this->awardTypeRepository->method('findAll')->willReturn([$existingType]);
|
||||||
|
|
||||||
@@ -73,7 +73,7 @@ class AwardImporterTest extends TestCase
|
|||||||
$this->assertSame($actor, $persisted[0]->getActor());
|
$this->assertSame($actor, $persisted[0]->getActor());
|
||||||
}
|
}
|
||||||
|
|
||||||
public function testCreatesNewAwardTypeWhenNoPatternMatches(): void
|
public function testCanonicalMapGroupsRelatedAwards(): void
|
||||||
{
|
{
|
||||||
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
||||||
|
|
||||||
@@ -98,14 +98,67 @@ class AwardImporterTest extends TestCase
|
|||||||
|
|
||||||
$newType = $persisted[0];
|
$newType = $persisted[0];
|
||||||
$this->assertInstanceOf(AwardType::class, $newType);
|
$this->assertInstanceOf(AwardType::class, $newType);
|
||||||
$this->assertSame('Screen Actors Guild Award', $newType->getName());
|
$this->assertSame('SAG', $newType->getName());
|
||||||
$this->assertSame('Screen Actors Guild Award', $newType->getPattern());
|
$this->assertSame('SAG', $newType->getPattern());
|
||||||
|
|
||||||
$award = $persisted[1];
|
$award = $persisted[1];
|
||||||
$this->assertInstanceOf(Award::class, $award);
|
$this->assertInstanceOf(Award::class, $award);
|
||||||
$this->assertSame($newType, $award->getAwardType());
|
$this->assertSame($newType, $award->getAwardType());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testFallsBackToExtractPrefixWhenNotInCanonicalMap(): void
|
||||||
|
{
|
||||||
|
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
||||||
|
|
||||||
|
$this->wikidataGateway->method('getAwardsForActors')->willReturn([
|
||||||
|
'Test Actor' => [
|
||||||
|
['name' => 'Bambi for Best Film', 'year' => 2019],
|
||||||
|
],
|
||||||
|
]);
|
||||||
|
|
||||||
|
$this->awardTypeRepository->method('findAll')->willReturn([]);
|
||||||
|
|
||||||
|
$persisted = [];
|
||||||
|
$this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) {
|
||||||
|
$persisted[] = $entity;
|
||||||
|
});
|
||||||
|
|
||||||
|
$this->importer->importForActors([$actor]);
|
||||||
|
|
||||||
|
$newType = $persisted[0];
|
||||||
|
$this->assertInstanceOf(AwardType::class, $newType);
|
||||||
|
$this->assertSame('Bambi', $newType->getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public function testExcludesNonEntertainmentAwards(): void
|
||||||
|
{
|
||||||
|
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
||||||
|
|
||||||
|
$this->wikidataGateway->method('getAwardsForActors')->willReturn([
|
||||||
|
'Test Actor' => [
|
||||||
|
['name' => 'chevalier de la Légion d\'honneur', 'year' => 2015],
|
||||||
|
['name' => 'docteur honoris causa', 'year' => 2018],
|
||||||
|
['name' => 'bourse Rhodes', 'year' => 2010],
|
||||||
|
['name' => 'Oscar du meilleur acteur', 'year' => 2020],
|
||||||
|
],
|
||||||
|
]);
|
||||||
|
|
||||||
|
$this->awardTypeRepository->method('findAll')->willReturn([]);
|
||||||
|
|
||||||
|
$persisted = [];
|
||||||
|
$this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) {
|
||||||
|
$persisted[] = $entity;
|
||||||
|
});
|
||||||
|
|
||||||
|
$this->importer->importForActors([$actor]);
|
||||||
|
|
||||||
|
// Only the Oscar should be persisted (1 AwardType + 1 Award)
|
||||||
|
$this->assertCount(2, $persisted);
|
||||||
|
$this->assertInstanceOf(AwardType::class, $persisted[0]);
|
||||||
|
$this->assertSame('Oscar', $persisted[0]->getName());
|
||||||
|
$this->assertInstanceOf(Award::class, $persisted[1]);
|
||||||
|
}
|
||||||
|
|
||||||
public function testDoesNotSetFlagOnWikidataError(): void
|
public function testDoesNotSetFlagOnWikidataError(): void
|
||||||
{
|
{
|
||||||
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
||||||
@@ -146,7 +199,7 @@ class AwardImporterTest extends TestCase
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
$existingType = new AwardType();
|
$existingType = new AwardType();
|
||||||
$existingType->setName('Oscar')->setPattern('Academy Award');
|
$existingType->setName('Oscar')->setPattern('Oscar');
|
||||||
|
|
||||||
$this->awardTypeRepository->method('findAll')->willReturn([$existingType]);
|
$this->awardTypeRepository->method('findAll')->willReturn([$existingType]);
|
||||||
|
|
||||||
@@ -163,6 +216,60 @@ class AwardImporterTest extends TestCase
|
|||||||
$this->assertCount(3, $persisted);
|
$this->assertCount(3, $persisted);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function testExtractPrefixHandlesFrenchPatterns(): void
|
||||||
|
{
|
||||||
|
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
||||||
|
|
||||||
|
$this->wikidataGateway->method('getAwardsForActors')->willReturn([
|
||||||
|
'Test Actor' => [
|
||||||
|
['name' => 'Bodil du meilleur acteur', 'year' => 2019],
|
||||||
|
],
|
||||||
|
]);
|
||||||
|
|
||||||
|
$this->awardTypeRepository->method('findAll')->willReturn([]);
|
||||||
|
|
||||||
|
$persisted = [];
|
||||||
|
$this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) {
|
||||||
|
$persisted[] = $entity;
|
||||||
|
});
|
||||||
|
|
||||||
|
$this->importer->importForActors([$actor]);
|
||||||
|
|
||||||
|
$newType = $persisted[0];
|
||||||
|
$this->assertInstanceOf(AwardType::class, $newType);
|
||||||
|
$this->assertSame('Bodil', $newType->getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public function testCanonicalMapReusesExistingType(): void
|
||||||
|
{
|
||||||
|
$actor = $this->createActorWithFlag('Test Actor', awardsImported: false);
|
||||||
|
|
||||||
|
$this->wikidataGateway->method('getAwardsForActors')->willReturn([
|
||||||
|
'Test Actor' => [
|
||||||
|
['name' => 'oscar du meilleur acteur', 'year' => 2020],
|
||||||
|
['name' => 'Oscar de la meilleure actrice', 'year' => 2021],
|
||||||
|
],
|
||||||
|
]);
|
||||||
|
|
||||||
|
$existingOscar = new AwardType();
|
||||||
|
$existingOscar->setName('Oscar')->setPattern('Oscar');
|
||||||
|
|
||||||
|
$this->awardTypeRepository->method('findAll')->willReturn([$existingOscar]);
|
||||||
|
|
||||||
|
$persisted = [];
|
||||||
|
$this->em->method('persist')->willReturnCallback(function ($entity) use (&$persisted) {
|
||||||
|
$persisted[] = $entity;
|
||||||
|
});
|
||||||
|
|
||||||
|
$this->importer->importForActors([$actor]);
|
||||||
|
|
||||||
|
// Both awards should reuse the same "Oscar" type — only 2 Awards persisted, no new AwardType
|
||||||
|
$this->assertCount(2, $persisted);
|
||||||
|
$this->assertContainsOnlyInstancesOf(Award::class, $persisted);
|
||||||
|
$this->assertSame($existingOscar, $persisted[0]->getAwardType());
|
||||||
|
$this->assertSame($existingOscar, $persisted[1]->getAwardType());
|
||||||
|
}
|
||||||
|
|
||||||
private function createActorWithFlag(string $name, bool $awardsImported): Actor
|
private function createActorWithFlag(string $name, bool $awardsImported): Actor
|
||||||
{
|
{
|
||||||
$actor = new Actor();
|
$actor = new Actor();
|
||||||
|
|||||||
Reference in New Issue
Block a user