{
  "apiVersion": "v1",
  "methodology": "https://sourcescore.org/methodology/",
  "canonical": "https://sourcescore.org/claims/4aef1422b96df26c/",
  "claim": {
    "vertical": "ai-ml",
    "subject": "The Pile dataset",
    "predicate": "released_on",
    "object": "2020-12-31",
    "confidence": 1,
    "sources": [
      {
        "url": "https://arxiv.org/abs/2101.00027",
        "title": "The Pile: An 800GB Dataset of Diverse Text for Language Modeling",
        "publisher": "arXiv (Gao, Biderman, Black, Golding, Hoppe, Foster, Phang, He, Thite, Nabeshima, Presser, Leahy)",
        "publishedDate": "2020-12-31",
        "accessedDate": "2026-05-16",
        "type": "preprint",
        "excerpt": "In this work, we present the Pile: an 825 GiB English text corpus targeted at training large-scale language models."
      },
      {
        "url": "https://pile.eleuther.ai/",
        "title": "The Pile — official site",
        "publisher": "EleutherAI",
        "accessedDate": "2026-05-16",
        "type": "official-blog"
      }
    ],
    "publishedAt": "2026-05-16T00:00:00Z",
    "lastVerified": "2026-05-16",
    "methodologyVersion": "veritas-v0.1",
    "tags": [
      "the-pile",
      "dataset",
      "pretraining",
      "eleutherai",
      "2020"
    ],
    "id": "4aef1422b96df26c",
    "statement": "The Pile dataset released on: 2020-12-31."
  },
  "signature": {
    "algorithm": "HMAC-SHA256",
    "signedBy": "did:web:sourcescore.org",
    "signedAt": "2026-05-16T00:00:00.000Z",
    "signature": "d815e8d4cc75b99818047c644b9404955e2616a5536e7ea453704dd15e16fd55"
  },
  "citedAs": "The Pile dataset released on: 2020-12-31. — SourceScore Claim 4aef1422b96df26c (verified 2026-05-16, signed d815e8d4…). https://sourcescore.org/claims/4aef1422b96df26c/"
}