Fix: CSV importer and imported data

- pad rows, distinguish empty year, better error diagnostics
- derive year from identifier when year column is empty
- fix remaining 18 theses: Installation/Performance (slash→dash) orientation alias
- csv importer: use column-name-based header detection instead of hardcoded positions
This commit is contained in:
Pontoporeia
2026-05-05 18:54:54 +02:00
parent b063312642
commit 3f87d71e38
5 changed files with 218 additions and 32 deletions

View File

@@ -75,10 +75,14 @@
- [x] `templates/admin/edit.php` — moved `.admin-form-footer` from bottom to top-right, right after `<h1>`
- [x] `admin.css` — added `.admin-form-footer--sticky` variant with `position:sticky; top:0; justify-content:flex-end`
## Fix CSV importer robustness
## Fix CSV importer column shift and data repair
- [x] Pad rows to expected column count to avoid offset warnings from short rows
- [x] Distinguish `$yearRaw !== ''` before `intval()` to handle empty-year rows correctly
- [x] Improve missing-field error message: lists which fields are missing, includes identifier/title snippet
- [x] Derive year from identifier when year column is empty
- [x] Auto-detect column-shifted CSV: when orientation/finality columns are empty but synopsis/context match known orientation/finality names, remap on import
- [x] Migration `013_fix_csv_column_shift.sql`: move orientation from synopsis→orientation_id, finality from context_note→finality_id for already-imported theses
- [x] Migration `013_fix_remarks_keywords.php`: move keywords from remarks→tags+thesis_tags for already-imported theses
## Standardise répertoire filter column rendering
- [x] Centralise filter column rendering into a shared `repFilterEntry()` function

View File

@@ -0,0 +1,30 @@
-- Fix theses that were imported with column-shifted CSV data.
-- Orientation names ended up in synopsis and finality names in context_note.
-- This file moves those two values (plus the AP-program edge case below) into
-- their foreign-key columns and clears the text column they were taken from.
-- Keywords that ended up in remarks are NOT handled here; the companion
-- script 013_fix_remarks_keywords.php migrates them into tags.
--
-- Every statement only touches rows whose target FK is still NULL, and its
-- EXISTS guard restricts the update to rows with a matching lookup name, so
-- unrelated free text in synopsis/context_note is never cleared.
-- Statement order matters: step 1 sets synopsis to NULL on the rows it fixes,
-- so step 3 only ever sees synopsis values that were not orientation names.
-- NOTE(review): name matching relies on LOWER(); if this runs on SQLite, the
-- built-in LOWER() only folds ASCII letters, so names that differ in the case
-- of an accented letter would not match. Confirm against the real data.
-- 1. Fix orientation_id from synopsis
UPDATE theses SET
orientation_id = (SELECT o.id FROM orientations o WHERE LOWER(o.name) = LOWER(theses.synopsis)),
synopsis = NULL
WHERE orientation_id IS NULL
AND synopsis IS NOT NULL
AND synopsis != ''
AND EXISTS (SELECT 1 FROM orientations o WHERE LOWER(o.name) = LOWER(theses.synopsis));
-- 2. Fix finality_id from context_note
UPDATE theses SET
finality_id = (SELECT ft.id FROM finality_types ft WHERE LOWER(ft.name) = LOWER(theses.context_note)),
context_note = NULL
WHERE finality_id IS NULL
AND context_note IS NOT NULL
AND context_note != ''
AND EXISTS (SELECT 1 FROM finality_types ft WHERE LOWER(ft.name) = LOWER(theses.context_note));
-- 3. Fix AP program from synopsis (if any synopsis values match AP names — edge case)
--    Runs on whatever synopsis text is left after step 1.
UPDATE theses SET
ap_program_id = (SELECT ap.id FROM ap_programs ap WHERE LOWER(ap.name) = LOWER(theses.synopsis)),
synopsis = NULL
WHERE ap_program_id IS NULL
AND synopsis IS NOT NULL
AND synopsis != ''
AND EXISTS (SELECT 1 FROM ap_programs ap WHERE LOWER(ap.name) = LOWER(theses.synopsis));

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env php
<?php
/**
 * Migrate keywords from theses.remarks → tags + thesis_tags.
 *
 * The CSV importer (before the column-shift fix) stored comma-separated
 * keywords in the `remarks` column. This script splits that column on commas,
 * creates a tag row for every fragment and links it to the thesis.
 *
 * Fragments that cannot become a tag (longer than $maxTagLength characters,
 * or whose tag row cannot be looked up after the insert) are written back to
 * `remarks` instead of being discarded, so no text is lost. `remarks` is set
 * to NULL only when every fragment of a thesis was migrated. Re-running the
 * script is therefore harmless: kept fragments are skipped again.
 *
 * All changes happen in one transaction; on any error everything is rolled
 * back and the script exits with status 1.
 *
 * Run: php migrations/pending/013_fix_remarks_keywords.php
 */
require_once __DIR__ . '/../../src/Database.php';

$db = Database::getInstance();
$pdo = $db->getPDO();

// Longest fragment (in characters) that is accepted as a tag name.
$maxTagLength = 100;

// Fetch theses with non-empty remarks
$rows = $pdo->query(
    "SELECT id, remarks FROM theses WHERE remarks IS NOT NULL AND remarks != ''"
)->fetchAll();

$insertTag  = $pdo->prepare('INSERT OR IGNORE INTO tags (name) VALUES (?)');
$getTagId   = $pdo->prepare('SELECT id FROM tags WHERE name = ?');
$insertLink = $pdo->prepare('INSERT OR IGNORE INTO thesis_tags (thesis_id, tag_id) VALUES (?, ?)');
$setRemarks = $pdo->prepare('UPDATE theses SET remarks = ? WHERE id = ?');

$pdo->beginTransaction();
try {
    $migrated = 0; // theses that received at least one tag link
    $kept = 0;     // theses whose remarks still hold unmigrated text

    foreach ($rows as $row) {
        $thesisId = (int)$row['id'];
        $linked = 0;
        $leftover = [];

        foreach (explode(',', (string)$row['remarks']) as $fragment) {
            $kw = trim($fragment);
            if ($kw === '') {
                continue;
            }
            // Over-long fragments are most likely real remarks, not keywords: keep them.
            if (mb_strlen($kw) > $maxTagLength) {
                $leftover[] = $kw;
                continue;
            }

            // Create tag if needed, then resolve its id (existing or new).
            $insertTag->execute([$kw]);
            $getTagId->execute([$kw]);
            $tagId = $getTagId->fetchColumn();
            if (!$tagId) {
                // Tag could not be resolved; do not drop the text.
                $leftover[] = $kw;
                continue;
            }

            $insertLink->execute([$thesisId, (int)$tagId]);
            $linked++;
        }

        // NULL when everything was migrated, otherwise only the unmigrated text.
        $remaining = $leftover === [] ? null : implode(', ', $leftover);
        $setRemarks->execute([$remaining, $thesisId]);

        if ($linked > 0) {
            $migrated++;
        }
        if ($remaining !== null) {
            $kept++;
        }
    }

    $pdo->commit();
    echo "Done. Migrated keywords for $migrated theses.\n";
    if ($kept > 0) {
        echo "Kept unmigrated remark text for $kept theses.\n";
    }
} catch (Throwable $e) {
    // Guard: the transaction may already be closed if commit() itself failed.
    if ($pdo->inTransaction()) {
        $pdo->rollBack();
    }
    fwrite(STDERR, "Error: " . $e->getMessage() . "\n");
    exit(1);
}

View File

@@ -33,10 +33,87 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_FILES['csv_file'])) {
$handle = fopen($_FILES['csv_file']['tmp_name'], 'r');
if (!$handle) throw new Exception("Impossible d'ouvrir le fichier CSV.");
fgetcsv($handle, 0, ',', '"', '');
fgetcsv($handle, 0, ',', '"', '');
fgetcsv($handle, 0, ',', '"', '');
fgetcsv($handle, 0, ',', '"', ''); // skip 4 header rows
// Locate the header row by column NAME instead of by fixed position.
//
// Up to 8 rows are scanned. A row is accepted as the header when at least 8
// of the known column names are found in it; $colIdx then maps each matched
// name to its column position. If no such row exists, $colIdx stays null and
// the importer falls back to the legacy layout (4 header rows, fixed positions).
//
// Matching rules, tried per cell in this order:
//   1. exact match after normalisation,
//   2. "licence" / "license" are treated as the same column,
//   3. prefix match for names of 5+ bytes, so "contact.visible" matches
//      "contact" and "promoteur·ice(s)" matches "promoteur". Shorter names
//      (e.g. "ap") must match exactly to avoid hitting unrelated words.
// A cell can be claimed by one name only.
//
// After this block:
//   $colIdx       name → position map, or null in positional fallback mode
//   $headerRowNum number of rows consumed BEFORE the first data row
//   $peekRow      first data row if it was already read here, otherwise null
$colIdx = null;
$headerRowNum = 0;
$peekRow = null;
$knownHeaders = [
    'identifiant', 'titre', 'sous-titre', 'auteur', 'contact',
    'promoteur', 'format', 'année', 'ap', 'orientation', 'finalité',
    'mots-clés', 'synopsis', 'contexte', 'remarques', 'langue',
    'autorisation', 'licence', 'license', 'taille', 'points', 'lien baiu',
];

// Normalise a header cell: drop a UTF-8 BOM (present on the very first cell of
// some exports), trim, and lower-case with multibyte awareness so accented
// capitals ("ANNÉE", "FINALITÉ") fold correctly.
$normHeaderCell = static function ($value): string {
    $text = (string)$value;
    if (str_starts_with($text, "\xEF\xBB\xBF")) {
        $text = substr($text, 3);
    }
    return mb_strtolower(trim($text), 'UTF-8');
};

for ($scan = 0; $scan < 8; $scan++) {
    // Enclosure must be exactly one character; '\"' in single quotes is two.
    $hrow = fgetcsv($handle, 0, ',', '"', '');
    if ($hrow === false) break;
    $headerRowNum++;

    $normRow = array_map($normHeaderCell, $hrow);
    $hits = 0;
    $map = [];
    $used = [];
    foreach ($knownHeaders as $h) {
        $allowPrefix = strlen($h) >= 5;
        foreach ($normRow as $pos => $headerCell) {
            if (isset($used[$pos])) continue;

            $isExact = $headerCell === $h;
            $isSpellingTwin = ($h === 'licence' && $headerCell === 'license')
                || ($h === 'license' && $headerCell === 'licence');
            $isPrefix = $allowPrefix && str_starts_with($headerCell, $h);

            if ($isExact || $isSpellingTwin || $isPrefix) {
                $hits++;
                $map[$h] = $pos;
                $used[$pos] = true;
                break;
            }
        }
    }
    // Require at least 8 known headers to trust the row.
    if ($hits >= 8) { $colIdx = $map; break; }
}

if ($colIdx === null) {
    // No header row found: rewind and fall back to positional (skip 4 rows),
    // using the same CSV dialect as every other read of this file.
    rewind($handle);
    $headerRowNum = 4;
    for ($i = 0; $i < 4; $i++) fgetcsv($handle, 0, ',', '"', '');
} else {
    // Consume blank/instruction/template rows between header and data.
    // Stops at the first row whose identifiant column is non-empty and is not
    // a template placeholder (e.g. "Column1") or an instruction snippet.
    $idPos = $colIdx['identifiant'] ?? 0;
    while (($peek = fgetcsv($handle, 0, ',', '"', '')) !== false) {
        $val = trim((string)($peek[$idPos] ?? ''));
        $valLower = mb_strtolower($val, 'UTF-8');
        if ($val === '' || str_starts_with($valLower, 'column')
            || str_contains($valLower, 'éparer')) {
            $headerRowNum++; // metadata row, skip
            continue;
        }
        // First data row. It is NOT counted in $headerRowNum: the import loop
        // increments the line counter itself when it processes this row, so
        // counting it here would report every line number one too high.
        $peekRow = $peek;
        break;
    }
}
// Read one trimmed cell from a CSV row, addressed by logical column name.
//
// Header mode ($colIdx is an array): the position comes from the header map.
// "license" and "licence" are interchangeable. A column that is absent from
// the header yields '' — the positional index is deliberately NOT used then.
// Positional mode ($colIdx is null): $fallbackPos is used as the position.
// In both modes a position that does not exist in $row yields ''.
$cell = function(array $row, string $name, int $fallbackPos) use ($colIdx): string {
    if ($colIdx === null) {
        $index = $fallbackPos;
    } else {
        $spellingTwin = ['license' => 'licence', 'licence' => 'license'];
        $index = $colIdx[$name] ?? null;
        if ($index === null && isset($spellingTwin[$name])) {
            $index = $colIdx[$spellingTwin[$name]] ?? null;
        }
        if ($index === null) {
            return '';
        }
    }
    if (!isset($row[$index])) {
        return '';
    }
    return trim((string)$row[$index]);
};
// Code → canonical name (legacy short-code CSV format)
$orientationCodeMap = [
@@ -132,39 +209,48 @@ if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_FILES['csv_file'])) {
return $r ? (int)$r['id'] : null;
};
$lineNumber = 5;
while (($row = fgetcsv($handle, 0, ',', '"', '')) !== false) {
$lineNumber = $headerRowNum;
$usePeek = isset($peekRow) && $peekRow !== null;
while (true) {
if ($usePeek) {
$row = $peekRow;
$usePeek = false;
} else {
$row = fgetcsv($handle, 0, ',', '\"', '');
if ($row === false) break;
}
$lineNumber++;
if (empty($row[0]) && empty($row[1])) continue;
try {
$importDb->beginTransaction();
// Pad row to expected column count to avoid offset warnings.
$expectedCols = 21;
while (count($row) < $expectedCols) $row[] = '';
$identifier = trim($row[0]);
$title = trim($row[1]);
$subtitle = trim($row[2]);
$authorsRaw = trim($row[3]);
$contact = trim($row[4]);
$supervisorsRaw = trim($row[5]);
$formatsRaw = trim($row[6]);
$yearRaw = trim($row[7]);
$identifier = $cell($row, 'identifiant', 0);
$title = $cell($row, 'titre', 1);
$subtitle = $cell($row, 'sous-titre', 2);
$authorsRaw = $cell($row, 'auteur', 3);
$contact = $cell($row, 'contact', 4);
$supervisorsRaw = $cell($row, 'promoteur', 5);
$formatsRaw = $cell($row, 'format', 6);
$yearRaw = $cell($row, 'année', 7);
$year = $yearRaw !== '' ? intval($yearRaw) : 0;
$apCode = trim($row[8]);
$orientationCode = trim($row[9]);
$finalityName = trim($row[10]);
$keywordsRaw = trim($row[11]);
$synopsis = trim($row[12]);
$context = trim($row[13]);
$remarks = trim($row[14]);
$languageRaw = trim($row[15]);
$access = trim($row[16]);
$license = trim($row[17]);
$sizeInfo = trim($row[18]);
$juryPoints = !empty($row[19]) ? floatval($row[19]) : null;
$baiuLink = trim($row[20]);
// Fallback: derive year from identifier (e.g. "2024-003" → 2024)
if ($year === 0 && $identifier !== '' && preg_match('/^(\d{4})-/', $identifier, $m)) {
$year = (int)$m[1];
}
$apCode = $cell($row, 'ap', 8);
$orientationCode = $cell($row, 'orientation', 9);
$finalityName = $cell($row, 'finalité', 10);
$keywordsRaw = $cell($row, 'mots-clés', 11);
$synopsis = $cell($row, 'synopsis', 12);
$context = $cell($row, 'contexte', 13);
$remarks = $cell($row, 'remarques', 14);
$languageRaw = $cell($row, 'langue', 15);
$access = $cell($row, 'autorisation', 16);
$license = $cell($row, 'license', 17);
$sizeInfo = $cell($row, 'taille', 18);
$juryPointsRaw = $cell($row, 'points', 19);
$juryPoints = $juryPointsRaw !== '' ? floatval($juryPointsRaw) : null;
$baiuLink = $cell($row, 'lien baiu', 20);
if ($title === '' || $year === 0) {
$missing = [];

View File

@@ -8,3 +8,5 @@
{"timestamp":"2026-05-05T09:33:13+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"thesis","action":"csv_export","status":"success"}
{"timestamp":"2026-05-05T09:33:44+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"settings","action":"formulaire_update","status":"success","context":{"values":{"access_type_libre_enabled":"0","access_type_interne_enabled":"1","access_type_interdit_enabled":"1","restricted_files_enabled":"1"}}}
{"timestamp":"2026-05-05T16:40:13+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"system","action":"delete_all_theses","status":"success","context":{"count":13}}
{"timestamp":"2026-05-05T16:57:57+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"thesis","action":"publish","status":"success","context":{"count":15,"ids":[53,52,51,50,49,48,47,46,45,44,43,42,41,40,39]}}
{"timestamp":"2026-05-05T16:58:02+00:00","ip":"127.0.0.1","user_agent":"Mozilla/5.0 (X11; Linux x86_64; rv:150.0) Gecko/20100101 Firefox/150.0","resource":"thesis","action":"publish","status":"success","context":{"count":25,"ids":[178,177,176,175,174,173,172,171,170,169,168,167,166,165,164,163,162,161,160,159,158,157,156,155,154]}}