From 4feef6f3437619d3bfe622a93eb4e30ed05c885f Mon Sep 17 00:00:00 2001 From: Jochen Voss Date: Sat, 1 Feb 2025 19:55:35 +0000 Subject: [PATCH] updates --- docs/content/pdf/text/fonts.md | 2 +- font/cff/cid.go | 2 +- font/encoding/enc2.go | 354 --------------- font/encoding/enc2_test.go | 242 ----------- font/encoding/encoder.go | 2 +- font/encoding/{encoding.go => old.go} | 38 +- .../{encoding_test.go => old_test.go} | 4 +- font/encoding/truetype.go | 150 +++++++ font/encoding/type1.go | 409 +++++++++++++----- font/encoding/type1_test.go | 254 +++++++++-- font/opentype/cidcff.go | 286 +++--------- font/opentype/cidcff_test.go | 97 ----- font/opentype/font.go | 10 +- font/opentype/simplecff.go | 17 +- font/opentype/simpleglyf.go | 114 ++++- font/simple/type1.go | 2 +- font/truetype/font.go | 8 +- font/truetype/simple.go | 173 ++++---- font/type3/font.go | 2 +- reader/font.go | 4 +- 20 files changed, 948 insertions(+), 1222 deletions(-) delete mode 100644 font/encoding/enc2.go delete mode 100644 font/encoding/enc2_test.go rename font/encoding/{encoding.go => old.go} (93%) rename font/encoding/{encoding_test.go => old_test.go} (99%) create mode 100644 font/encoding/truetype.go diff --git a/docs/content/pdf/text/fonts.md b/docs/content/pdf/text/fonts.md index 4c3232f6..87d86d97 100644 --- a/docs/content/pdf/text/fonts.md +++ b/docs/content/pdf/text/fonts.md @@ -183,7 +183,7 @@ The following information applies to simple PDF fonts. 
| | non-symbolic | symbolic | | ----------: | :----------: | :--------: | | encoding | 4 | 2 | - | no encoding | 1 | avoid | + | no encoding | avoid | 1 | Reading: I plan to try the methods in the following order and to use the first one which succeeds: diff --git a/font/cff/cid.go b/font/cff/cid.go index b331431e..531457e6 100644 --- a/font/cff/cid.go +++ b/font/cff/cid.go @@ -134,7 +134,7 @@ func (f *embeddedComposite) Finish(rm *pdf.ResourceManager) error { } dw := subsetCFF.GlyphWidthPDF(0) - isSymbolic := false + isSymbolic := false // TODO(voss): set this correctly qh := subsetCFF.FontMatrix[0] * 1000 qv := subsetCFF.FontMatrix[3] * 1000 diff --git a/font/encoding/enc2.go b/font/encoding/enc2.go deleted file mode 100644 index 6b8e0567..00000000 --- a/font/encoding/enc2.go +++ /dev/null @@ -1,354 +0,0 @@ -// seehuhn.de/go/pdf - a library for reading and writing PDF files -// Copyright (C) 2024 Jochen Voss -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . - -package encoding - -import ( - "errors" - - "seehuhn.de/go/pdf" - "seehuhn.de/go/pdf/font/pdfenc" -) - -// Type1 gives the glyph name for each code point. -// The empty string indicates unused codes. -// The special value [UseBuiltin] indicates that the corresponding glyph from -// the built-in encoding should be used. 
-type Type1 func(code byte) string - -const UseBuiltin = "@" - -var ( - Builtin Type1 = func(code byte) string { - return UseBuiltin - } - WinAnsi Type1 = func(code byte) string { - return pdfenc.WinAnsi.Encoding[code] - } - MacRoman Type1 = func(code byte) string { - return pdfenc.MacRoman.Encoding[code] - } - MacExpert Type1 = func(code byte) string { - return pdfenc.MacExpert.Encoding[code] - } - Standard Type1 = func(code byte) string { - return pdfenc.Standard.Encoding[code] - } -) - -// ExtractType1 extracts the encoding from the /Encoding entry of a Type1 -// font dictionary. -// -// If the argument nonSymbolicExt is true, the function assumes that the font -// has the non-symbolic flag set in the font descriptor and that the font is -// not embedded in the PDF file. -// -// If /Encoding is malformed, the font's built-in encoding is used as a -// fallback. -func ExtractType1(r pdf.Getter, obj pdf.Object, nonSymbolicExt bool) (Type1, error) { - obj, err := pdf.Resolve(r, obj) - if err != nil { - return nil, err - } - - if name, ok := obj.(pdf.Name); ok { - switch name { - case "WinAnsiEncoding": - return WinAnsi, nil - case "MacRomanEncoding": - return MacRoman, nil - case "MacExpertEncoding": - return MacExpert, nil - } - } - - dict, _ := obj.(pdf.Dict) - if dict == nil { - return Builtin, nil - } - if err := pdf.CheckDictType(r, dict, "Encoding"); err != nil { - return Builtin, err - } - - // If we reach this point, we have found an encoding dictionary. 
- - var baseEnc Type1 - baseEncName, _ := pdf.GetName(r, dict["BaseEncoding"]) - switch baseEncName { - case "WinAnsiEncoding": - baseEnc = WinAnsi - case "MacRomanEncoding": - baseEnc = MacRoman - case "MacExpertEncoding": - baseEnc = MacExpert - default: - if nonSymbolicExt { // non-symbolic and not embedded - baseEnc = Standard - } else { // symbolic or embedded - baseEnc = Builtin - } - } - - differences := make(map[byte]string) - if diffArray, _ := pdf.GetArray(r, dict["Differences"]); diffArray != nil { - currentCode := pdf.Integer(-1) - for _, item := range diffArray { - item, err = pdf.Resolve(r, item) - if err != nil { - return nil, err - } - - switch item := item.(type) { - case pdf.Integer: - currentCode = item - - case pdf.Name: - if currentCode >= 0 && currentCode < 256 { - differences[byte(currentCode)] = string(item) - currentCode++ - } - } - } - } - if len(differences) == 0 { - return baseEnc, nil - } - - return func(code byte) string { - if glyphName, ok := differences[code]; ok { - return glyphName - } - return baseEnc(code) - }, nil -} - -// AsPDFType1 returns the /Encoding entry for Type1 font dictionary. -// -// If the argument baseIsStd is true, Differences arrays record changes from -// the standard encoding. Otherwise, Differences arrays record changes from the -// built-in encoding. The flag baseIsStd should be set if the non-symbolic flag -// set in the font descriptor and that the font is not be embedded in the PDF -// file. If the flag is set, the built-in encoding must either be used for all -// mapped codes, or not at all. -// -// The resulting PDF object describes an encoding which maps all characters -// mapped by e to the given glyph name, but it may also imply glyph names for -// the unmapped codes. 
-func (e Type1) AsPDFType1(baseIsStd bool, opt pdf.OutputOptions) (pdf.Object, error) { - type candInfo struct { - encName pdf.Native - enc []string - differences pdf.Array - } - - // First check whether we can use the built-in encoding. - canUseBuiltin := true - for code := range 256 { - if e(byte(code)) != "" && e(byte(code)) != UseBuiltin { - canUseBuiltin = false - break - } - } - if canUseBuiltin { - return nil, nil - } - - // Next, if no codes are mapped to the built-in encoding, we may be able to - // use a named encoding. - noBuiltin := true - for code := range 256 { - if e(byte(code)) == UseBuiltin { - noBuiltin = false - break - } - } - if noBuiltin { - candidates := []*candInfo{ - {encName: pdf.Name("WinAnsiEncoding"), enc: pdfenc.WinAnsi.Encoding[:]}, - {encName: pdf.Name("MacRomanEncoding"), enc: pdfenc.MacRoman.Encoding[:]}, - {encName: pdf.Name("MacExpertEncoding"), enc: pdfenc.MacExpert.Encoding[:]}, - } - candidateLoop: - for _, cand := range candidates { - for code := range 256 { - if glyphName := e(byte(code)); glyphName != "" && glyphName != cand.enc[code] { - // we got a conflict, try the next candidate - continue candidateLoop - } - } - return cand.encName, nil - } - } - - // If we reach this point, we need an encoding dictionary. We choose the - // base encoding which leads to the smallest Differences array. - - var candidates []*candInfo - if noBuiltin { - candidates = []*candInfo{ - {encName: pdf.Name("WinAnsiEncoding"), enc: pdfenc.WinAnsi.Encoding[:]}, - {encName: pdf.Name("MacRomanEncoding"), enc: pdfenc.MacRoman.Encoding[:]}, - {encName: pdf.Name("MacExpertEncoding"), enc: pdfenc.MacExpert.Encoding[:]}, - } - if baseIsStd { - // If a font is marked as non-symbolic in the font descriptor and - // the font is not embedded, a missing `BaseEncoding` field - // represents the standard encoding. 
- candidates = append(candidates, - &candInfo{encName: nil, enc: pdfenc.Standard.Encoding[:]}, - ) - } - for _, cand := range candidates { - lastDiff := 999 - for code := range 256 { - glyphName := e(byte(code)) - if glyphName == "" || glyphName == cand.enc[code] { - continue - } - - if code != lastDiff+1 { - cand.differences = append(cand.differences, pdf.Integer(code)) - } - cand.differences = append(cand.differences, pdf.Name(glyphName)) - lastDiff = code - } - } - } else { - if baseIsStd { - // If the font is marked as non-symbolic in the font descriptor and - // the font is not embedded, a missing `BaseEncoding` field - // represents the standard encoding. In this case, there is no way - // to refer to the built-in encoding. - return nil, errInvalidEncoding - } - - var diff pdf.Array - lastDiff := 999 - for code := range 256 { - glyphName := e(byte(code)) - if glyphName == "" || glyphName == UseBuiltin { - continue - } - - if code != lastDiff+1 { - diff = append(diff, pdf.Integer(code)) - } - diff = append(diff, pdf.Name(glyphName)) - lastDiff = code - } - - candidates = append(candidates, &candInfo{ - encName: nil, - differences: diff, - }) - } - - // candidates is non-empty at this point - - var bestDict pdf.Dict - bestDiffLength := 999 - for _, cand := range candidates { - if L := len(cand.differences); L < bestDiffLength { - bestDiffLength = L - bestDict = pdf.Dict{} - if cand.encName != nil { - bestDict["BaseEncoding"] = cand.encName - } - if L > 0 { - bestDict["Differences"] = cand.differences - } - } - } - if opt.HasAny(pdf.OptDictTypes) { - bestDict["Type"] = pdf.Name("Encoding") - } - return bestDict, nil -} - -// ExtractType3 extracts the encoding from the /Encoding entry of a Type3 -// font dictionary. 
-func ExtractType3(r pdf.Getter, obj pdf.Object) (Type1, error) { - dict, err := pdf.GetDictTyped(r, obj, "Encoding") - if err != nil { - return nil, err - } - - diffArray, err := pdf.GetArray(r, dict["Differences"]) - if err != nil { - return nil, err - } - - differences := make(map[byte]string) - - currentCode := pdf.Integer(-1) - for _, item := range diffArray { - item, err = pdf.Resolve(r, item) - if err != nil { - return nil, err - } - - switch item := item.(type) { - case pdf.Integer: - currentCode = item - - case pdf.Name: - if currentCode >= 0 && currentCode < 256 { - differences[byte(currentCode)] = string(item) - currentCode++ - } - } - } - - if len(differences) == 0 { - return nil, &pdf.MalformedFileError{ - Err: errors.New("missing /Differences array"), - } - } - - return func(code byte) string { - return differences[code] - }, nil -} - -// AsPDFType3 returns the /Encoding entry for Type3 font dictionary. -func (e Type1) AsPDFType3(opt pdf.OutputOptions) (pdf.Object, error) { - var differences pdf.Array - - lastDiff := 999 - for code := range 256 { - glyphName := e(byte(code)) - if glyphName == "" { - continue - } - - if code != lastDiff+1 { - differences = append(differences, pdf.Integer(code)) - } - differences = append(differences, pdf.Name(glyphName)) - lastDiff = code - } - - dict := pdf.Dict{ - "Differences": differences, - } - if opt.HasAny(pdf.OptDictTypes) { - dict["Type"] = pdf.Name("Encoding") - } - - return dict, nil -} - -var errInvalidEncoding = errors.New("invalid encoding") diff --git a/font/encoding/enc2_test.go b/font/encoding/enc2_test.go deleted file mode 100644 index 29780540..00000000 --- a/font/encoding/enc2_test.go +++ /dev/null @@ -1,242 +0,0 @@ -// seehuhn.de/go/pdf - a library for reading and writing PDF files -// Copyright (C) 2024 Jochen Voss -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, 
either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . - -package encoding - -import ( - "fmt" - "testing" - - "seehuhn.de/go/pdf" -) - -type MockGetter struct{} - -func (m *MockGetter) Get(ref pdf.Reference, canObjStm bool) (pdf.Native, error) { - return nil, nil -} - -func (m *MockGetter) GetMeta() *pdf.MetaInfo { - return nil -} - -func TestType1Encoding(t *testing.T) { - type mapping struct { - code byte - value string - } - type testCase struct { - name string - encoding pdf.Object - nonSymbolicExt bool - mappings []mapping - } - cases := []testCase{ - { - name: "nil encoding", - encoding: nil, - mappings: []mapping{ - {code: 0, value: UseBuiltin}, - {code: 1, value: UseBuiltin}, - {code: 255, value: UseBuiltin}, - }, - }, - { - name: "MacRomanEncoding", - encoding: pdf.Name("MacRomanEncoding"), - mappings: []mapping{ - {code: 0o101, value: "A"}, - {code: 0o256, value: "AE"}, - {code: 0o331, value: "Ydieresis"}, - }, - }, - { - name: "WinAnsiEncoding", - encoding: pdf.Name("WinAnsiEncoding"), - mappings: []mapping{ - {code: 0o101, value: "A"}, - {code: 0o306, value: "AE"}, - {code: 0o237, value: "Ydieresis"}, - }, - }, - { - name: "MacExpertEncoding", - encoding: pdf.Name("MacExpertEncoding"), - mappings: []mapping{ - {code: 0o276, value: "AEsmall"}, - {code: 0o207, value: "Aacutesmall"}, - {code: 0o342, value: "zerosuperior"}, - }, - }, - { - name: "dict/nil/true", - encoding: pdf.Dict{}, - nonSymbolicExt: true, - mappings: []mapping{ // standard encoding - {code: 0o101, value: "A"}, - {code: 0o341, value: "AE"}, - {code: 0o331, value: ".notdef"}, - }, - }, - { - name: 
"dict/nil/false", - encoding: pdf.Dict{}, - nonSymbolicExt: false, - mappings: []mapping{ // built-in encoding - {code: 0o101, value: UseBuiltin}, - {code: 0o341, value: UseBuiltin}, - {code: 0o331, value: UseBuiltin}, - }, - }, - { - name: "dict/MacRomanEncoding", - encoding: pdf.Dict{ - "BaseEncoding": pdf.Name("MacRomanEncoding"), - }, - mappings: []mapping{ - {code: 0o101, value: "A"}, - {code: 0o256, value: "AE"}, - {code: 0o331, value: "Ydieresis"}, - }, - }, - { - name: "dict/WinAnsiEncoding", - encoding: pdf.Dict{ - "BaseEncoding": pdf.Name("WinAnsiEncoding"), - }, - mappings: []mapping{ - {code: 0o101, value: "A"}, - {code: 0o306, value: "AE"}, - {code: 0o237, value: "Ydieresis"}, - }, - }, - { - name: "dict/MacExpertEncoding", - encoding: pdf.Dict{ - "BaseEncoding": pdf.Name("MacExpertEncoding"), - }, - mappings: []mapping{ - {code: 0o276, value: "AEsmall"}, - {code: 0o207, value: "Aacutesmall"}, - {code: 0o342, value: "zerosuperior"}, - }, - }, - { - name: "differences", - encoding: pdf.Dict{ - "BaseEncoding": pdf.Name("MacRomanEncoding"), - "Differences": pdf.Array{ - pdf.Integer(0o101), pdf.Name("B"), pdf.Name("A"), - pdf.Integer(0o177), pdf.Name("silly"), - }, - }, - mappings: []mapping{ - {code: 0o101, value: "B"}, - {code: 0o102, value: "A"}, - {code: 0o103, value: "C"}, - {code: 0o177, value: "silly"}, - }, - }, - } - r := &MockGetter{} - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - enc, err := ExtractType1(r, c.encoding, c.nonSymbolicExt) - if err != nil { - t.Fatal(err) - } - if enc == nil { - t.Fatal("expected encoding, got nil") - } - for _, m := range c.mappings { - if got := enc(m.code); got != m.value { - t.Errorf("encoding mismatch at %d: got %q, want %q", m.code, got, m.value) - } - } - }) - } -} - -func TestType1Roundtrip(t *testing.T) { - var cases = []Type1{ - Builtin, - WinAnsi, - MacRoman, - MacExpert, - Standard, - func(code byte) string { - switch code { - case 30: - return "Gandalf" - case 31: - return "Elrond" 
- case 32: - return "Galadriel" - case 100: - return "Gimli" - case 101: - return "Frodo" - case 102: - return "Sam" - default: - return WinAnsi(code) - } - }, - func(code byte) string { - switch code { - case 0: - return "Gandalf" - case 2: - return "Elrond" - case 4: - return "Galadriel" - case 126: - return "Gimli" - case 128: - return "Frodo" - case 130: - return "Sam" - default: - return Builtin(code) - } - }, - } - for i, enc1 := range cases { - for _, nonSymbolicExt := range []bool{true, false} { - t.Run(fmt.Sprintf("%d/%v", i, nonSymbolicExt), func(t *testing.T) { - obj, err := enc1.AsPDFType1(nonSymbolicExt, 0) - if err == errInvalidEncoding { - t.Skip("encoding cannot be represented as PDF object") - } - if err != nil { - t.Fatal(err) - } - - enc2, err := ExtractType1(&MockGetter{}, obj, nonSymbolicExt) - if err != nil { - t.Fatal(err) - } - - for code := range 256 { - if got, want := enc1(byte(code)), enc2(byte(code)); got != want { - t.Errorf("encoding mismatch at %d: got %q, want %q", code, got, want) - break - } - } - }) - } - } -} diff --git a/font/encoding/encoder.go b/font/encoding/encoder.go index 353d3ff2..f45a16fe 100644 --- a/font/encoding/encoder.go +++ b/font/encoding/encoder.go @@ -53,7 +53,7 @@ func NewSimpleEncoder() *SimpleEncoder { // WritingMode implements the [font.NewFont] interface. func (e *SimpleEncoder) WritingMode() cmap.WritingMode { - return 0 // simple fonts are always horizontal + return cmap.Horizontal // simple fonts are always horizontal } // GIDToCode returns the character code for the given glyph ID (allocating new diff --git a/font/encoding/encoding.go b/font/encoding/old.go similarity index 93% rename from font/encoding/encoding.go rename to font/encoding/old.go index 610091fc..305ae346 100644 --- a/font/encoding/encoding.go +++ b/font/encoding/old.go @@ -26,26 +26,30 @@ import ( "seehuhn.de/go/pdf/font/pdfenc" ) -// An Encoding describes a mapping between one-byte character codes and CIDs. 
+// An EncodingOld describes a mapping between one-byte character codes and CIDs. // // CID values can represent either glyph names, or entries in the built-in // encoding of a font. The interpretation of CID values is specific to the // encoder instance. CID 0 is reserved for unmapped codes. -type Encoding struct { +// +// TODO(voss): remove +// +// Deprecated: Use one of the other implementations instead. +type EncodingOld struct { enc [256]cmap.CID glyphNames []string } // New allocates a new Encoding object. -func New() *Encoding { - return &Encoding{} +func New() *EncodingOld { + return &EncodingOld{} } // Allocate allocates a new CID for a named glyph. // // If a CID has already been allocated for the glyph name, the same CID is // returned. Otherwise, a new CID is allocated and returned. -func (e *Encoding) Allocate(glyphName string) cmap.CID { +func (e *EncodingOld) Allocate(glyphName string) cmap.CID { if glyphName == "" { panic("encoding: missing glyph name") } @@ -66,7 +70,7 @@ func (e *Encoding) Allocate(glyphName string) cmap.CID { // UseBuiltinEncoding maps a character code to the corresponding glyph // of the built-in encoding. -func (e *Encoding) UseBuiltinEncoding(code byte) cmap.CID { +func (e *EncodingOld) UseBuiltinEncoding(code byte) cmap.CID { cid := 1 + cmap.CID(code) e.enc[code] = cid return cid @@ -75,7 +79,7 @@ func (e *Encoding) UseBuiltinEncoding(code byte) cmap.CID { // GlyphName returns the glyph name associated with a CID. // // For codes mapped via the built-in encoding, the empty string is returned. -func (e *Encoding) GlyphName(cid cmap.CID) string { +func (e *EncodingOld) GlyphName(cid cmap.CID) string { if cid == 0 { return ".notdef" } @@ -94,7 +98,7 @@ func (e *Encoding) GlyphName(cid cmap.CID) string { // Decode returns the CID associated with a character code. // If the code is not mapped, 0 is returned. 
-func (e *Encoding) Decode(code byte) cmap.CID { +func (e *EncodingOld) Decode(code byte) cmap.CID { return e.enc[code] } @@ -106,7 +110,7 @@ func (e *Encoding) Decode(code byte) cmap.CID { // // The resulting PDF object describes an encoding which maps all characters // mapped by e in the specified way, but it may also map additional codes. -func (e *Encoding) AsPDFType1(nonSymbolicExt bool, opt pdf.OutputOptions) (pdf.Object, error) { +func (e *EncodingOld) AsPDFType1(nonSymbolicExt bool, opt pdf.OutputOptions) (pdf.Object, error) { type candInfo struct { encName pdf.Native enc []string @@ -246,7 +250,7 @@ func (e *Encoding) AsPDFType1(nonSymbolicExt bool, opt pdf.OutputOptions) (pdf.O // // The glyph names for all mapped codes must be known (either via the encoding, // or via the builtin encoding). Otherwise an error is returned. -func (e *Encoding) AsPDFTrueType(builtin []string, opt pdf.OutputOptions) (pdf.Object, error) { +func (e *EncodingOld) AsPDFTrueType(builtin []string, opt pdf.OutputOptions) (pdf.Object, error) { // First check that all glyph names are known. for code := range 256 { cid := e.enc[code] @@ -339,7 +343,7 @@ candidateLoop: // font. // // On success, the function returns a [pdf.Dict] object. -func (e *Encoding) AsPDFType3(opt pdf.OutputOptions) (pdf.Object, error) { +func (e *EncodingOld) AsPDFType3(opt pdf.OutputOptions) (pdf.Object, error) { dict := pdf.Dict{} if opt.HasAny(pdf.OptDictTypes) { dict["Type"] = pdf.Name("Encoding") @@ -373,7 +377,7 @@ func (e *Encoding) AsPDFType3(opt pdf.OutputOptions) (pdf.Object, error) { // Deprecated: Use [ExtractType1] instead. 
// // TODO(voss): remove -func ExtractType1Old(r pdf.Getter, dicts *font.Dicts) (*Encoding, error) { +func ExtractType1Old(r pdf.Getter, dicts *font.Dicts) (*EncodingOld, error) { obj, err := pdf.Resolve(r, dicts.FontDict["Encoding"]) if err != nil { return nil, err @@ -443,7 +447,7 @@ func ExtractType1Old(r pdf.Getter, dicts *font.Dicts) (*Encoding, error) { return e, nil } -func ExtractTrueType(r pdf.Getter, obj pdf.Object) (*Encoding, error) { +func ExtractTrueType(r pdf.Getter, obj pdf.Object) (*EncodingOld, error) { obj, err := pdf.Resolve(r, obj) if err != nil { return nil, err @@ -519,7 +523,7 @@ func ExtractTrueType(r pdf.Getter, obj pdf.Object) (*Encoding, error) { return e, nil } -func ExtractType3Old(r pdf.Getter, obj pdf.Object) (*Encoding, error) { +func ExtractType3Old(r pdf.Getter, obj pdf.Object) (*EncodingOld, error) { dict, err := pdf.GetDictTyped(r, obj, "Encoding") if err != nil { return nil, err @@ -556,13 +560,13 @@ func ExtractType3Old(r pdf.Getter, obj pdf.Object) (*Encoding, error) { return e, nil } -func (e *Encoding) initBuiltinEncoding() { +func (e *EncodingOld) initBuiltinEncoding() { for code := range 256 { e.UseBuiltinEncoding(byte(code)) } } -func (e *Encoding) initNamedEncoding(name pdf.Name) error { +func (e *EncodingOld) initNamedEncoding(name pdf.Name) error { var enc []string switch name { case "WinAnsiEncoding": @@ -585,7 +589,7 @@ func (e *Encoding) initNamedEncoding(name pdf.Name) error { return nil } -func (e *Encoding) initStandardEncoding() { +func (e *EncodingOld) initStandardEncoding() { for code, name := range pdfenc.Standard.Encoding { if name == ".notdef" { continue diff --git a/font/encoding/encoding_test.go b/font/encoding/old_test.go similarity index 99% rename from font/encoding/encoding_test.go rename to font/encoding/old_test.go index 130e14ef..98202c54 100644 --- a/font/encoding/encoding_test.go +++ b/font/encoding/old_test.go @@ -285,7 +285,7 @@ var testBuiltinEncoding = []string{ } type sampleEncoding struct 
{ - enc *Encoding + enc *EncodingOld names []string } @@ -390,7 +390,7 @@ func TestAsPDF(t *testing.T) { return } - var enc2 *Encoding + var enc2 *EncodingOld var obj pdf.Object var err error switch tp { diff --git a/font/encoding/truetype.go b/font/encoding/truetype.go new file mode 100644 index 00000000..4b31ad05 --- /dev/null +++ b/font/encoding/truetype.go @@ -0,0 +1,150 @@ +// seehuhn.de/go/pdf - a library for reading and writing PDF files +// Copyright (C) 2025 Jochen Voss +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +package encoding + +import ( + "math/bits" + + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" + "seehuhn.de/go/pdf/font/cmap" + "seehuhn.de/go/pdf/font/pdfenc" + "seehuhn.de/go/postscript/type1/names" + "seehuhn.de/go/sfnt/glyph" +) + +type TrueTypeEncoder struct { + Encoding []glyph.ID + code map[key]byte + key map[byte]key +} + +// NewSimpleEncoder allocates a new SimpleEncoder. +func NewTrueTypeEncoder() *TrueTypeEncoder { + res := &TrueTypeEncoder{ + Encoding: make([]glyph.ID, 256), + code: make(map[key]byte), + key: make(map[byte]key), + } + return res +} + +// WritingMode implements the [font.NewFont] interface. +func (e *TrueTypeEncoder) WritingMode() cmap.WritingMode { + return cmap.Horizontal // simple fonts are always horizontal +} + +// GIDToCode returns the character code for the given glyph ID (allocating new +// codes as needed). 
It also records the fact that the character code +// corresponds to the given unicode string. +func (e *TrueTypeEncoder) GIDToCode(gid glyph.ID, rr []rune) byte { + k := key{gid, string(rr)} + + // Rules for choosing the code: + // 1. If the combination of `gid` and `rr` has previously been used, + // then use the same code as before. + code, seen := e.code[k] + if seen { + return code + } + + // 2. Allocate a new code based on the last rune in rr. + var r rune + if len(rr) > 0 { + r = rr[len(rr)-1] + } + code = e.allocateCode(r) + e.Encoding[code] = gid + e.code[k] = code + e.key[code] = k + + return code +} + +func (e *TrueTypeEncoder) allocateCode(r rune) byte { + if len(e.code) >= 256 { + // Once all codes are used up, simply return 0 for everything. + return 0 + } + bestScore := -1 + bestCode := byte(0) + for codeInt := 0; codeInt < 256; codeInt++ { + code := byte(codeInt) + if _, alreadyUsed := e.key[code]; alreadyUsed { + continue + } + var score int + q := rune(code) + winAnsiName := pdfenc.WinAnsi.Encoding[code] + if winAnsiName == ".notdef" { + // fill up the unused slots first + score += 100 + } else { + q = names.ToUnicode(winAnsiName, false)[0] + if q == r { + // If r is in the WinAnsi encoding, and the corresponding + // code is still free, then use it. + bestCode = code + break + } else if !(code == 32 && r != ' ') { + // Try to keep code 32 for the space character, + // in order to not break the PDF word spacing parameter. + score += 10 + } + } + score += bits.TrailingZeros16(uint16(r) ^ uint16(q)) + if score > bestScore { + bestScore = score + bestCode = code + } + } + return bestCode +} + +// CodeIsUsed returns true if the given code has already been allocated. +// This can be used to distinguish between codes which have +// explicitly been mapped to GID 0 and codes which are not used. +func (e *TrueTypeEncoder) CodeIsUsed(code byte) bool { + _, used := e.key[code] + return used +} + +// Overflow returns true if the encoder has run out of codes. 
+func (e *TrueTypeEncoder) Overflow() bool { + return len(e.code) > 256 +} + +// Subset returns the subset of glyph IDs which are used by this encoder. +// The result is sorted and always include the glyph ID 0. +func (e *TrueTypeEncoder) Subset() []glyph.ID { + gidUsed := make(map[glyph.ID]bool, len(e.code)+1) + gidUsed[0] = true + for key := range e.code { + gidUsed[key.gid] = true + } + subset := maps.Keys(gidUsed) + slices.Sort(subset) + return subset +} + +// ToUnicodeNew returns the mapping from character codes to unicode strings. +// This can be used to construct a PDF ToUnicode CMap. +func (e *TrueTypeEncoder) FillText(text *[256]string) { + for k, c := range e.code { + (*text)[c] = k.rr + } +} diff --git a/font/encoding/type1.go b/font/encoding/type1.go index 751daed1..4102a2c7 100644 --- a/font/encoding/type1.go +++ b/font/encoding/type1.go @@ -1,5 +1,5 @@ // seehuhn.de/go/pdf - a library for reading and writing PDF files -// Copyright (C) 2023 Jochen Voss +// Copyright (C) 2024 Jochen Voss // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -17,150 +17,337 @@ package encoding import ( - "bytes" "errors" - "fmt" - "math" "seehuhn.de/go/pdf" "seehuhn.de/go/pdf/font/pdfenc" ) -// DescribeEncodingType1 returns the /Encoding entry for the font dictionary -// of a Type 1 font. The arguments are the encoding used by the client, -// and the font's builtin encoding. +// Type1 gives the glyph name for each code point. +// The empty string indicates unused codes. +// The special value [UseBuiltin] indicates that the corresponding glyph from +// the built-in encoding should be used. 
+type Type1 func(code byte) string + +const UseBuiltin = "@" + +var ( + Builtin Type1 = func(code byte) string { + return UseBuiltin + } + WinAnsi Type1 = func(code byte) string { + return pdfenc.WinAnsi.Encoding[code] + } + MacRoman Type1 = func(code byte) string { + return pdfenc.MacRoman.Encoding[code] + } + MacExpert Type1 = func(code byte) string { + return pdfenc.MacExpert.Encoding[code] + } + Standard Type1 = func(code byte) string { + return pdfenc.Standard.Encoding[code] + } +) + +// ExtractType1 extracts the encoding from the /Encoding entry of a Type1 +// font dictionary. // -// See section 9.6.1 and 9.6.5 of ISO 32000-2:2020. -func DescribeEncodingType1(encoding, builtin []string) pdf.Object { - type cand struct { - name pdf.Object - enc []string - } - candidates := []cand{ - {nil, builtin}, - {pdf.Name("WinAnsiEncoding"), pdfenc.WinAnsi.Encoding[:]}, - {pdf.Name("MacRomanEncoding"), pdfenc.MacRoman.Encoding[:]}, - {pdf.Name("MacExpertEncoding"), pdfenc.MacExpert.Encoding[:]}, - } - - type D struct { - code int - newName pdf.Name - } - var diff []D - var desc pdf.Dict - descLen := math.MaxInt - for _, cand := range candidates { - diff = diff[:0] - for code, name := range encoding { - if name != ".notdef" && name != cand.enc[code] { - diff = append(diff, D{code, pdf.Name(name)}) +// If the argument nonSymbolicExt is true, the function assumes that the font +// has the non-symbolic flag set in the font descriptor and that the font is +// not embedded in the PDF file. +// +// If /Encoding is malformed, the font's built-in encoding is used as a +// fallback. 
+func ExtractType1(r pdf.Getter, obj pdf.Object, nonSymbolicExt bool) (Type1, error) { + obj, err := pdf.Resolve(r, obj) + if err != nil { + return nil, err + } + + if name, ok := obj.(pdf.Name); ok { + switch name { + case "WinAnsiEncoding": + return WinAnsi, nil + case "MacRomanEncoding": + return MacRoman, nil + case "MacExpertEncoding": + return MacExpert, nil + } + } + + dict, _ := obj.(pdf.Dict) + if dict == nil { + return Builtin, nil + } + if err := pdf.CheckDictType(r, dict, "Encoding"); err != nil { + return Builtin, err + } + + // If we reach this point, we have found an encoding dictionary. + + var baseEnc Type1 + baseEncName, _ := pdf.GetName(r, dict["BaseEncoding"]) + switch baseEncName { + case "WinAnsiEncoding": + baseEnc = WinAnsi + case "MacRomanEncoding": + baseEnc = MacRoman + case "MacExpertEncoding": + baseEnc = MacExpert + default: + if nonSymbolicExt { // non-symbolic and not embedded + baseEnc = Standard + } else { // symbolic or embedded + baseEnc = Builtin + } + } + + differences := make(map[byte]string) + if diffArray, _ := pdf.GetArray(r, dict["Differences"]); diffArray != nil { + currentCode := pdf.Integer(-1) + for _, item := range diffArray { + item, err = pdf.Resolve(r, item) + if err != nil { + return nil, err + } + + switch item := item.(type) { + case pdf.Integer: + currentCode = item + + case pdf.Name: + if currentCode >= 0 && currentCode < 256 { + differences[byte(currentCode)] = string(item) + currentCode++ + } } } - if len(diff) == 0 { - return cand.name + } + if len(differences) == 0 { + return baseEnc, nil + } + + return func(code byte) string { + if glyphName, ok := differences[code]; ok { + return glyphName + } + return baseEnc(code) + }, nil +} + +// AsPDFType1 returns the /Encoding entry for Type1 font dictionary. +// +// If the argument baseIsStd is true, Differences arrays record changes from +// the standard encoding. Otherwise, Differences arrays record changes from the +// built-in encoding. 
The flag should be set if the font is non-symbolic and is +// not embedded in the PDF file. If the flag is set, the built-in encoding +// must either be used for all mapped codes, or not at all. +// +// The resulting PDF object describes an encoding which maps all characters +// mapped by e to the given glyph name, but it may also imply glyph names for +// the unmapped codes. +func (e Type1) AsPDFType1(baseIsStd bool, opt pdf.OutputOptions) (pdf.Object, error) { + type candInfo struct { + encName pdf.Native + enc []string + differences pdf.Array + } + + // First check whether we can use the built-in encoding. + canUseBuiltin := true + for code := range 256 { + if e(byte(code)) != "" && e(byte(code)) != UseBuiltin { + canUseBuiltin = false + break + } + } + if canUseBuiltin { + return nil, nil + } + + // Next, if no codes are mapped to the built-in encoding, we may be able to + // use a named encoding. + noBuiltin := true + for code := range 256 { + if e(byte(code)) == UseBuiltin { + noBuiltin = false + break + } + } + if noBuiltin { + candidates := []*candInfo{ + {encName: pdf.Name("WinAnsiEncoding"), enc: pdfenc.WinAnsi.Encoding[:]}, + {encName: pdf.Name("MacRomanEncoding"), enc: pdfenc.MacRoman.Encoding[:]}, + {encName: pdf.Name("MacExpertEncoding"), enc: pdfenc.MacExpert.Encoding[:]}, + } + candidateLoop: + for _, cand := range candidates { + for code := range 256 { + if glyphName := e(byte(code)); glyphName != "" && glyphName != cand.enc[code] { + // we got a conflict, try the next candidate + continue candidateLoop + } + } + return cand.encName, nil } + } + + // If we reach this point, we need an encoding dictionary. We choose the + // base encoding which leads to the smallest Differences array.
- newDesc := pdf.Dict{} - if cand.name != nil { - newDesc["BaseEncoding"] = cand.name + var candidates []*candInfo + if noBuiltin { + candidates = []*candInfo{ + {encName: pdf.Name("WinAnsiEncoding"), enc: pdfenc.WinAnsi.Encoding[:]}, + {encName: pdf.Name("MacRomanEncoding"), enc: pdfenc.MacRoman.Encoding[:]}, + {encName: pdf.Name("MacExpertEncoding"), enc: pdfenc.MacExpert.Encoding[:]}, } - var a pdf.Array - prev := 256 - for _, d := range diff { - if d.code != prev+1 { - a = append(a, pdf.Integer(d.code)) + if baseIsStd { + // If a font is marked as non-symbolic in the font descriptor and + // the font is not embedded, a missing `BaseEncoding` field + // represents the standard encoding. + candidates = append(candidates, + &candInfo{encName: nil, enc: pdfenc.Standard.Encoding[:]}, + ) + } + for _, cand := range candidates { + lastDiff := 999 + for code := range 256 { + glyphName := e(byte(code)) + if glyphName == "" || glyphName == cand.enc[code] { + continue + } + + if code != lastDiff+1 { + cand.differences = append(cand.differences, pdf.Integer(code)) + } + cand.differences = append(cand.differences, pdf.Name(glyphName)) + lastDiff = code } - a = append(a, d.newName) - prev = d.code } - newDesc["Differences"] = a + } else { + if baseIsStd { + // If the font is marked as non-symbolic in the font descriptor and + // the font is not embedded, a missing `BaseEncoding` field + // represents the standard encoding. In this case, there is no way + // to refer to the built-in encoding. 
+ return nil, errInvalidEncoding + } + + var diff pdf.Array + lastDiff := 999 + for code := range 256 { + glyphName := e(byte(code)) + if glyphName == "" || glyphName == UseBuiltin { + continue + } - b := &bytes.Buffer{} - pdf.Format(b, 0, newDesc) - if b.Len() < descLen { - desc = newDesc - descLen = b.Len() + if code != lastDiff+1 { + diff = append(diff, pdf.Integer(code)) + } + diff = append(diff, pdf.Name(glyphName)) + lastDiff = code } + + candidates = append(candidates, &candInfo{ + encName: nil, + differences: diff, + }) } - return desc + // candidates is non-empty at this point + + var bestDict pdf.Dict + bestDiffLength := 999 + for _, cand := range candidates { + if L := len(cand.differences); L < bestDiffLength { + bestDiffLength = L + bestDict = pdf.Dict{} + if cand.encName != nil { + bestDict["BaseEncoding"] = cand.encName + } + if L > 0 { + bestDict["Differences"] = cand.differences + } + } + } + if opt.HasAny(pdf.OptDictTypes) { + bestDict["Type"] = pdf.Name("Encoding") + } + return bestDict, nil } -// UndescribeEncodingType1 returns the encoding used by the client, given -// the /Encoding entry for the font dictionary of a Type 1 font and the -// font's builtin encoding. -// -// This function is nearly the inverse of [DescribeEncodingType1]: if -// the name assigned to a code is not `.notdef`, then [DescribeEncodingType1] -// followed by [UndescribeEncodingType1] will return the same name. -func UndescribeEncodingType1(r pdf.Getter, desc pdf.Object, builtin []string) ([]string, error) { - desc, err := pdf.Resolve(r, desc) +// ExtractType3 extracts the encoding from the /Encoding entry of a Type3 +// font dictionary. 
+func ExtractType3(r pdf.Getter, obj pdf.Object) (Type1, error) { + dict, err := pdf.GetDictTyped(r, obj, "Encoding") if err != nil { return nil, err } - switch desc := desc.(type) { - case nil: - return builtin, nil - case pdf.Name: - return getNamedEncoding(desc) - case pdf.Dict: - base, err := pdf.GetName(r, desc["BaseEncoding"]) + diffArray, err := pdf.GetArray(r, dict["Differences"]) + if err != nil { + return nil, err + } + + differences := make(map[byte]string) + + currentCode := pdf.Integer(-1) + for _, item := range diffArray { + item, err = pdf.Resolve(r, item) if err != nil { return nil, err } - res := make([]string, 256) - baseEnc := builtin - if base != "" { - baseEnc, err = getNamedEncoding(base) - if err != nil { - return nil, err + + switch item := item.(type) { + case pdf.Integer: + currentCode = item + + case pdf.Name: + if currentCode >= 0 && currentCode < 256 { + differences[byte(currentCode)] = string(item) + currentCode++ } } - if baseEnc == nil { - return nil, errors.New("encoding: invalid base encoding") - } - copy(res, baseEnc) + } - a, err := pdf.GetArray(r, desc["Differences"]) - if err != nil { - return nil, err + if len(differences) == 0 { + return nil, &pdf.MalformedFileError{ + Err: errors.New("missing /Differences array"), } - code := -1 - for _, x := range a { - switch x := x.(type) { - case pdf.Integer: - if x < 0 || x >= 256 { - return nil, fmt.Errorf("encoding: invalid code %d", x) - } - code = int(x) - case pdf.Name: - if code < 0 || code >= 256 { - return nil, fmt.Errorf("encoding: invalid code %d", code) - } - res[code] = string(x) - code++ - default: - return nil, fmt.Errorf("encoding: expected Integer or Name, got %T", x) - } + } + + return func(code byte) string { + return differences[code] + }, nil +} + +// AsPDFType3 returns the /Encoding entry for Type3 font dictionary. 
+func (e Type1) AsPDFType3(opt pdf.OutputOptions) (pdf.Object, error) { + var differences pdf.Array + + lastDiff := 999 + for code := range 256 { + glyphName := e(byte(code)) + if glyphName == "" { + continue } - return res, nil - default: - return nil, fmt.Errorf("encoding: expected Name or Dict, got %T", desc) + if code != lastDiff+1 { + differences = append(differences, pdf.Integer(code)) + } + differences = append(differences, pdf.Name(glyphName)) + lastDiff = code } -} -func getNamedEncoding(name pdf.Name) ([]string, error) { - switch name { - case "WinAnsiEncoding": - return pdfenc.WinAnsi.Encoding[:], nil - case "MacRomanEncoding": - return pdfenc.MacRoman.Encoding[:], nil - case "MacExpertEncoding": - return pdfenc.MacExpert.Encoding[:], nil - default: - return nil, fmt.Errorf("unknown encoding %q", name) + dict := pdf.Dict{ + "Differences": differences, } + if opt.HasAny(pdf.OptDictTypes) { + dict["Type"] = pdf.Name("Encoding") + } + + return dict, nil } + +var errInvalidEncoding = errors.New("invalid encoding") diff --git a/font/encoding/type1_test.go b/font/encoding/type1_test.go index 7f71f2dc..29780540 100644 --- a/font/encoding/type1_test.go +++ b/font/encoding/type1_test.go @@ -1,5 +1,5 @@ // seehuhn.de/go/pdf - a library for reading and writing PDF files -// Copyright (C) 2023 Jochen Voss +// Copyright (C) 2024 Jochen Voss // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -17,64 +17,226 @@ package encoding import ( + "fmt" "testing" - "seehuhn.de/go/pdf/font/pdfenc" + "seehuhn.de/go/pdf" ) -func TestDescribeEncoding(t *testing.T) { - funnyEncoding := make([]string, 256) - for i := range funnyEncoding { - funnyEncoding[i] = ".notdef" - } - funnyEncoding[0o001] = "funny" // non-standard name - funnyEncoding[0o101] = "A" // common to all encodings - funnyEncoding[0o102] = "C" // clashes with all encodings - funnyEncoding[0o103] = "B" // clashes with all 
encodings - funnyEncoding[0o104] = "D" // common to all encodings - funnyEncoding[0o142] = "Bsmall" // only in MacExpertEncoding - funnyEncoding[0o201] = "A" // double encode some characters - funnyEncoding[0o202] = "B" // double encode some characters - funnyEncoding[0o203] = "C" // double encode some characters - funnyEncoding[0o204] = "D" // double encode some characters - funnyEncoding[0o214] = "OE" // only in WinAnsiEncoding - funnyEncoding[0o227] = "Scaron" // only in PdfDocEncoding - funnyEncoding[0o341] = "AE" // only in StandardEncoding - funnyEncoding[0o347] = "Aacute" // only in MacRomanEncoding +type MockGetter struct{} - encodings := [][]string{ - pdfenc.Standard.Encoding[:], - pdfenc.MacRoman.Encoding[:], - pdfenc.MacExpert.Encoding[:], - pdfenc.WinAnsi.Encoding[:], - pdfenc.PDFDoc.Encoding[:], - funnyEncoding, - } +func (m *MockGetter) Get(ref pdf.Reference, canObjStm bool) (pdf.Native, error) { + return nil, nil +} + +func (m *MockGetter) GetMeta() *pdf.MetaInfo { + return nil +} - for i, enc := range encodings { - for j, builtin := range encodings { - desc := DescribeEncodingType1(enc, builtin) - if i == j { - if desc != nil { - t.Errorf("DescribeEncoding(%d, %d) = %v", i, j, desc) +func TestType1Encoding(t *testing.T) { + type mapping struct { + code byte + value string + } + type testCase struct { + name string + encoding pdf.Object + nonSymbolicExt bool + mappings []mapping + } + cases := []testCase{ + { + name: "nil encoding", + encoding: nil, + mappings: []mapping{ + {code: 0, value: UseBuiltin}, + {code: 1, value: UseBuiltin}, + {code: 255, value: UseBuiltin}, + }, + }, + { + name: "MacRomanEncoding", + encoding: pdf.Name("MacRomanEncoding"), + mappings: []mapping{ + {code: 0o101, value: "A"}, + {code: 0o256, value: "AE"}, + {code: 0o331, value: "Ydieresis"}, + }, + }, + { + name: "WinAnsiEncoding", + encoding: pdf.Name("WinAnsiEncoding"), + mappings: []mapping{ + {code: 0o101, value: "A"}, + {code: 0o306, value: "AE"}, + {code: 0o237, value: 
"Ydieresis"}, + }, + }, + { + name: "MacExpertEncoding", + encoding: pdf.Name("MacExpertEncoding"), + mappings: []mapping{ + {code: 0o276, value: "AEsmall"}, + {code: 0o207, value: "Aacutesmall"}, + {code: 0o342, value: "zerosuperior"}, + }, + }, + { + name: "dict/nil/true", + encoding: pdf.Dict{}, + nonSymbolicExt: true, + mappings: []mapping{ // standard encoding + {code: 0o101, value: "A"}, + {code: 0o341, value: "AE"}, + {code: 0o331, value: ".notdef"}, + }, + }, + { + name: "dict/nil/false", + encoding: pdf.Dict{}, + nonSymbolicExt: false, + mappings: []mapping{ // built-in encoding + {code: 0o101, value: UseBuiltin}, + {code: 0o341, value: UseBuiltin}, + {code: 0o331, value: UseBuiltin}, + }, + }, + { + name: "dict/MacRomanEncoding", + encoding: pdf.Dict{ + "BaseEncoding": pdf.Name("MacRomanEncoding"), + }, + mappings: []mapping{ + {code: 0o101, value: "A"}, + {code: 0o256, value: "AE"}, + {code: 0o331, value: "Ydieresis"}, + }, + }, + { + name: "dict/WinAnsiEncoding", + encoding: pdf.Dict{ + "BaseEncoding": pdf.Name("WinAnsiEncoding"), + }, + mappings: []mapping{ + {code: 0o101, value: "A"}, + {code: 0o306, value: "AE"}, + {code: 0o237, value: "Ydieresis"}, + }, + }, + { + name: "dict/MacExpertEncoding", + encoding: pdf.Dict{ + "BaseEncoding": pdf.Name("MacExpertEncoding"), + }, + mappings: []mapping{ + {code: 0o276, value: "AEsmall"}, + {code: 0o207, value: "Aacutesmall"}, + {code: 0o342, value: "zerosuperior"}, + }, + }, + { + name: "differences", + encoding: pdf.Dict{ + "BaseEncoding": pdf.Name("MacRomanEncoding"), + "Differences": pdf.Array{ + pdf.Integer(0o101), pdf.Name("B"), pdf.Name("A"), + pdf.Integer(0o177), pdf.Name("silly"), + }, + }, + mappings: []mapping{ + {code: 0o101, value: "B"}, + {code: 0o102, value: "A"}, + {code: 0o103, value: "C"}, + {code: 0o177, value: "silly"}, + }, + }, + } + r := &MockGetter{} + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + enc, err := ExtractType1(r, c.encoding, c.nonSymbolicExt) + if err != 
nil { + t.Fatal(err) + } + if enc == nil { + t.Fatal("expected encoding, got nil") + } + for _, m := range c.mappings { + if got := enc(m.code); got != m.value { + t.Errorf("encoding mismatch at %d: got %q, want %q", m.code, got, m.value) } } + }) + } +} - enc2, err := UndescribeEncodingType1(nil, desc, builtin) - if err != nil { - t.Error(err) - continue +func TestType1Roundtrip(t *testing.T) { + var cases = []Type1{ + Builtin, + WinAnsi, + MacRoman, + MacExpert, + Standard, + func(code byte) string { + switch code { + case 30: + return "Gandalf" + case 31: + return "Elrond" + case 32: + return "Galadriel" + case 100: + return "Gimli" + case 101: + return "Frodo" + case 102: + return "Sam" + default: + return WinAnsi(code) + } + }, + func(code byte) string { + switch code { + case 0: + return "Gandalf" + case 2: + return "Elrond" + case 4: + return "Galadriel" + case 126: + return "Gimli" + case 128: + return "Frodo" + case 130: + return "Sam" + default: + return Builtin(code) } + }, + } + for i, enc1 := range cases { + for _, nonSymbolicExt := range []bool{true, false} { + t.Run(fmt.Sprintf("%d/%v", i, nonSymbolicExt), func(t *testing.T) { + obj, err := enc1.AsPDFType1(nonSymbolicExt, 0) + if err == errInvalidEncoding { + t.Skip("encoding cannot be represented as PDF object") + } + if err != nil { + t.Fatal(err) + } - for c, name := range enc { - if name == ".notdef" { - continue + enc2, err := ExtractType1(&MockGetter{}, obj, nonSymbolicExt) + if err != nil { + t.Fatal(err) } - if enc2[c] != name { - t.Errorf("UndescribeEncoding(%d, %d) = %v", i, j, enc2) - break + + for code := range 256 { + if got, want := enc1(byte(code)), enc2(byte(code)); got != want { + t.Errorf("encoding mismatch at %d: got %q, want %q", code, got, want) + break + } } - } + }) } } } diff --git a/font/opentype/cidcff.go b/font/opentype/cidcff.go index 414faf03..d0d744d4 100644 --- a/font/opentype/cidcff.go +++ b/font/opentype/cidcff.go @@ -22,7 +22,6 @@ import ( "seehuhn.de/go/geom/matrix" 
pscid "seehuhn.de/go/postscript/cid" - "seehuhn.de/go/postscript/funit" "seehuhn.de/go/sfnt" "seehuhn.de/go/sfnt/cff" @@ -30,10 +29,10 @@ import ( "seehuhn.de/go/pdf" "seehuhn.de/go/pdf/font" + "seehuhn.de/go/pdf/font/cidfont" "seehuhn.de/go/pdf/font/cmap" "seehuhn.de/go/pdf/font/pdfenc" "seehuhn.de/go/pdf/font/subset" - "seehuhn.de/go/pdf/font/widths" ) type embeddedCFFComposite struct { @@ -69,7 +68,7 @@ func (f *embeddedCFFComposite) AppendEncoded(s pdf.String, gid glyph.ID, rr []ru return s, width } -func (f *embeddedCFFComposite) Finish(*pdf.ResourceManager) error { +func (f *embeddedCFFComposite) Finish(rm *pdf.ResourceManager) error { if f.closed { return nil } @@ -96,9 +95,6 @@ func (f *embeddedCFFComposite) Finish(*pdf.ResourceManager) error { } ros := f.ROS() - toUnicode := f.ToUnicode() - - cmapInfo := f.CMap() // If the CFF font is CID-keyed, *i.e.* if it contain a `ROS` operator, // then the `charset` table in the CFF font describes the mapping from CIDs @@ -110,8 +106,10 @@ func (f *embeddedCFFComposite) Finish(*pdf.ResourceManager) error { break } } + outlines := subsetOTF.Outlines.(*cff.Outlines) mustUseCID := len(outlines.Private) > 1 + if isIdentity && !mustUseCID { // Make the font non-CID-keyed. outlines.Encoding = cff.StandardEncoding(outlines.Glyphs) outlines.ROS = nil @@ -119,8 +117,8 @@ func (f *embeddedCFFComposite) Finish(*pdf.ResourceManager) error { } else { // Make the font CID-keyed. 
outlines.Encoding = nil var sup int32 - if ros.Supplement > 0 && ros.Supplement < 0x1000_0000 { - sup = int32(ros.Supplement) + if sup32 := int32(ros.Supplement); ros.Supplement == pdf.Integer(sup32) { + sup = sup32 } outlines.ROS = &pscid.SystemInfo{ Registry: ros.Registry, @@ -128,236 +126,76 @@ func (f *embeddedCFFComposite) Finish(*pdf.ResourceManager) error { Supplement: sup, } outlines.GIDToCID = gidToCID + outlines.FontMatrices = make([]matrix.Matrix, len(outlines.Private)) for i := range outlines.Private { outlines.FontMatrices[i] = matrix.Identity } } - info := FontDictCFFComposite{ - Font: subsetOTF, - SubsetTag: subsetTag, - CMap: cmapInfo, - ToUnicode: toUnicode, - } - return info.Embed(f.w, f.ref) -} - -// FontDictCFFComposite is the information needed to embed a composite OpenType/CFF font. -type FontDictCFFComposite struct { - // Font is the font to embed (already subsetted, if needed). - Font *sfnt.Font - - // SubsetTag should be a unique tag for the font subset, - // or the empty string if this is the full font. - SubsetTag string - - CMap *cmap.FileOld - - IsAllCap bool - IsSmallCap bool - - // ToUnicode (optional) is a map from character codes to unicode strings. - ToUnicode *cmap.ToUnicodeOld -} - -// ExtractCFFComposite extracts information about a composite OpenType/CFF font from a PDF file. -// This is the inverse of [FontDictCFFComposite.Embed]. 
-func ExtractCFFComposite(r pdf.Getter, dicts *font.Dicts) (*FontDictCFFComposite, error) { - if err := dicts.FontTypeOld.MustBe(font.OpenTypeCFFComposite); err != nil { - return nil, err - } - - res := &FontDictCFFComposite{} - - stmObj, err := pdf.GetStream(r, dicts.FontData) - if err != nil { - return nil, err - } - stmData, err := pdf.DecodeStream(r, stmObj, 0) - if err != nil { - return nil, err - } - otf, err := sfnt.Read(stmData) - if err != nil { - return nil, err - } - if _, ok := otf.Outlines.(*cff.Outlines); !ok { - return nil, fmt.Errorf("expected CFF outlines, got %T", otf.Outlines) - } - - // Most OpenType tables will be missing, so we fill in information from - // the font descriptor instead. - otf.IsSerif = dicts.FontDescriptor.IsSerif - otf.IsScript = dicts.FontDescriptor.IsScript - q := 1000 / float64(otf.UnitsPerEm) - otf.Ascent = funit.Int16(math.Round(float64(dicts.FontDescriptor.Ascent) / q)) - otf.Descent = funit.Int16(math.Round(float64(dicts.FontDescriptor.Descent) / q)) - otf.CapHeight = funit.Int16(math.Round(float64(dicts.FontDescriptor.CapHeight) / q)) - res.Font = otf - - postScriptName, _ := pdf.GetName(r, dicts.CIDFontDict["BaseFont"]) - if m := subset.TagRegexp.FindStringSubmatch(string(postScriptName)); m != nil { - res.SubsetTag = m[1] - } - - cmapInfo, err := cmap.Extract(r, dicts.FontDict["Encoding"]) - if err != nil { - return nil, err - } - res.CMap = cmapInfo - - res.IsAllCap = dicts.FontDescriptor.IsAllCap - res.IsSmallCap = dicts.FontDescriptor.IsSmallCap - - if info, _ := cmap.ExtractToUnicodeOld(r, dicts.FontDict["ToUnicode"], cmapInfo.CodeSpaceRange); info != nil { - res.ToUnicode = info - } - - return res, nil -} - -// Embed adds a composite OpenType/CFF font to a PDF file. -// This implements the [font.Dict] interface. 
-// This is the reverse of [ExtractCFFComposite] -func (info *FontDictCFFComposite) Embed(w *pdf.Writer, fontDictRef pdf.Reference) error { - err := pdf.CheckVersion(w, "composite OpenType/CFF fonts", pdf.V1_6) - if err != nil { - return err - } + postScriptName := subsetOTF.PostScriptName() - sfnt := info.Font - if !sfnt.IsCFF() { - return fmt.Errorf("not an OpenType/CFF font") - } - cff := sfnt.AsCFF() - - cidFontName := cff.FontInfo.FontName - if info.SubsetTag != "" { - cidFontName = info.SubsetTag + "+" + cidFontName - } - - // make a PDF CMap - cmapInfo := info.CMap - var encoding pdf.Object - if cmapInfo.IsPredefined() { - encoding = pdf.Name(cmapInfo.Name) - } else { - encoding = w.Alloc() + ww := make(map[cmap.CID]float64) + for gid, cid := range gidToCID { + ww[cid] = subsetOTF.GlyphWidthPDF(glyph.ID(gid)) } + dw := subsetOTF.GlyphWidthPDF(0) - glyphwidths := sfnt.Widths() - ww := make(map[cmap.CID]float64, len(glyphwidths)) - if cff.GIDToCID != nil { - for gid, w := range glyphwidths { - ww[cmap.CID(cff.GIDToCID[gid])] = float64(w) * sfnt.FontMatrix[0] * 1000 + isSymbolic := false + for _, g := range outlines.Glyphs { + name := g.Name // TODO(voss): is this correct? 
+ if name == ".notdef" { + continue } - } else { - for gid, w := range glyphwidths { - ww[cmap.CID(gid)] = float64(w) * sfnt.FontMatrix[0] * 1000 + if !pdfenc.StandardLatin.Has[name] { + isSymbolic = true + break } } - W, DW := widths.EncodeComposite(ww, pdf.GetVersion(w)) - - q := 1000 / float64(sfnt.UnitsPerEm) - fontBBox := sfnt.FontBBoxPDF() - - // isSymbolic := !font.IsNonSymbolic(sfnt) - isSymbolic := !pdfenc.IsNonSymbolic(sfnt.MakeGlyphNames()) - - cidFontRef := w.Alloc() - var toUnicodeRef pdf.Reference - fontDescriptorRef := w.Alloc() - fontFileRef := w.Alloc() - - fontDict := pdf.Dict{ - "Type": pdf.Name("Font"), - "Subtype": pdf.Name("Type0"), - "BaseFont": pdf.Name(cidFontName + "-" + cmapInfo.Name), - "Encoding": encoding, - "DescendantFonts": pdf.Array{cidFontRef}, - } - if info.ToUnicode != nil { - toUnicodeRef = w.Alloc() - fontDict["ToUnicode"] = toUnicodeRef - } - - ROS := pdf.Dict{ - "Registry": pdf.String(info.CMap.ROS.Registry), - "Ordering": pdf.String(info.CMap.ROS.Ordering), - "Supplement": pdf.Integer(info.CMap.ROS.Supplement), - } - cidFontDict := pdf.Dict{ - "Type": pdf.Name("Font"), - "Subtype": pdf.Name("CIDFontType0"), - "BaseFont": pdf.Name(cidFontName), - "CIDSystemInfo": ROS, - "FontDescriptor": fontDescriptorRef, + qh := subsetOTF.FontMatrix[0] * 1000 // TODO(voss): is this correct for CID-keyed fonts? 
+ qv := subsetOTF.FontMatrix[3] * 1000 + ascent := subsetOTF.Ascent.AsFloat(qv) + descent := subsetOTF.Descent.AsFloat(qv) + lineGap := subsetOTF.LineGap.AsFloat(qv) + var leading float64 + if lineGap > 0 { + leading = ascent - descent + lineGap } - if DW != 1000 { - cidFontDict["DW"] = pdf.Number(DW) - } - if W != nil { - cidFontDict["W"] = W - } - fd := &font.Descriptor{ - FontName: cidFontName, - IsFixedPitch: cff.IsFixedPitch, - IsSerif: sfnt.IsSerif, + FontName: subset.Join(subsetTag, postScriptName), + FontFamily: subsetOTF.FamilyName, + FontStretch: subsetOTF.Width, + FontWeight: subsetOTF.Weight, + IsFixedPitch: subsetOTF.IsFixedPitch(), + IsSerif: subsetOTF.IsSerif, IsSymbolic: isSymbolic, - IsScript: sfnt.IsScript, - IsItalic: sfnt.IsItalic, - IsAllCap: info.IsAllCap, - IsSmallCap: info.IsSmallCap, - ForceBold: cff.Private[0].ForceBold, - FontBBox: fontBBox.Rounded(), - ItalicAngle: sfnt.ItalicAngle, - Ascent: sfnt.Ascent.AsFloat(q), - Descent: sfnt.Descent.AsFloat(q), - CapHeight: sfnt.CapHeight.AsFloat(q), - } - fontDescriptor := fd.AsDict() - fontDescriptor["FontFile3"] = fontFileRef - - compressedRefs := []pdf.Reference{fontDictRef, cidFontRef, fontDescriptorRef} - compressedObjects := []pdf.Object{fontDict, cidFontDict, fontDescriptor} - err = w.WriteCompressed(compressedRefs, compressedObjects...) - if err != nil { - return pdf.Wrap(err, "composite OpenType/CFF font dicts") - } - - // See section 9.9 of PDF 32000-1:2008 for details. 
- fontFileDict := pdf.Dict{ - "Subtype": pdf.Name("OpenType"), - } - fontFileStream, err := w.OpenStream(fontFileRef, fontFileDict, pdf.FilterCompress{}) - if err != nil { - return err - } - err = sfnt.WriteOpenTypeCFFPDF(fontFileStream) - if err != nil { - return fmt.Errorf("OpenType/CFF font program %q: %w", cidFontName, err) - } - err = fontFileStream.Close() - if err != nil { - return err - } - - if ref, ok := encoding.(pdf.Reference); ok { - err = cmapInfo.Embed(w, ref, nil) - if err != nil { - return err - } - } - - if toUnicodeRef != 0 { - err = info.ToUnicode.Embed(w, toUnicodeRef) - if err != nil { - return err - } - } - - return nil + IsScript: subsetOTF.IsScript, + IsItalic: subsetOTF.IsItalic, + ForceBold: outlines.Private[0].ForceBold, + FontBBox: subsetOTF.FontBBoxPDF().Rounded(), + ItalicAngle: subsetOTF.ItalicAngle, + Ascent: math.Round(ascent), + Descent: math.Round(descent), + Leading: math.Round(leading), + CapHeight: math.Round(subsetOTF.CapHeight.AsFloat(qv)), + XHeight: math.Round(subsetOTF.XHeight.AsFloat(qv)), + StemV: math.Round(outlines.Private[0].StdVW * qh), + StemH: math.Round(outlines.Private[0].StdHW * qv), + } + info := &cidfont.Type0Dict{ + Ref: f.ref, + PostScriptName: postScriptName, + SubsetTag: subsetTag, + Descriptor: fd, + ROS: ros, + Encoding: f.CMapNew(), + Width: ww, + DefaultWidth: dw, + Text: f.ToUnicodeNew(), + GetFont: func() (cidfont.Type0FontData, error) { + return subsetOTF, nil + }, + } + return info.WriteToPDF(rm) } diff --git a/font/opentype/cidcff_test.go b/font/opentype/cidcff_test.go index 98ca26af..42a124b2 100644 --- a/font/opentype/cidcff_test.go +++ b/font/opentype/cidcff_test.go @@ -15,100 +15,3 @@ // along with this program. If not, see . 
package opentype_test - -import ( - "testing" - "time" - - "github.com/google/go-cmp/cmp" - "seehuhn.de/go/pdf" - "seehuhn.de/go/pdf/font" - "seehuhn.de/go/pdf/font/charcode" - "seehuhn.de/go/pdf/font/cmap" - "seehuhn.de/go/pdf/font/opentype" - "seehuhn.de/go/pdf/internal/debug/makefont" - "seehuhn.de/go/pdf/internal/debug/memfile" - "seehuhn.de/go/postscript/cid" - "seehuhn.de/go/sfnt" - "seehuhn.de/go/sfnt/cff" -) - -func TestRoundTripCFFComposite(t *testing.T) { - otf := makefont.OpenType() - cs := charcode.CodeSpaceRange{ - {Low: []byte{0x04}, High: []byte{0x07}}, - {Low: []byte{0x10, 0x12}, High: []byte{0x11, 0x13}}, - } - ros := &cmap.CIDSystemInfo{ - Registry: "Test", - Ordering: "Sonderbar", - Supplement: 13, - } - cmapData := make(map[charcode.CharCodeOld]cid.CID, 8) - for code := charcode.CharCodeOld(0); code < 8; code++ { - cmapData[code] = cid.CID(2*code + 1) - } - cmapInfo := cmap.FromMapOld(ros, cs, cmapData) - m := make(map[charcode.CharCodeOld][]rune, 8) - for code := charcode.CharCodeOld(0); code < 8; code++ { - m[code] = []rune{'X', '0' + rune(code)} - } - toUnicode := cmap.NewToUnicode(cs, m) - info1 := &opentype.FontDictCFFComposite{ - Font: otf, - SubsetTag: "ABCDEF", - CMap: cmapInfo, - ToUnicode: toUnicode, - } - - rw, _ := memfile.NewPDFWriter(pdf.V1_7, nil) - ref := rw.Alloc() - err := info1.Embed(rw, ref) - if err != nil { - t.Fatal(err) - } - - dicts, err := font.ExtractDicts(rw, ref) - if err != nil { - t.Fatal(err) - } - info2, err := opentype.ExtractCFFComposite(rw, dicts) - if err != nil { - t.Fatal(err) - } - - // normalize the fonts before comparing them - for _, font := range []*sfnt.Font{info1.Font, info2.Font} { - // LineGap is stored in the "hmtx" and "OS/2" tables. - font.LineGap = 0 - - // Width is stored in the "OS/2" table. - font.Width = 0 - - // IsRegular is stored in the "OS/2" table. - font.IsRegular = false - - // CodePageRange is stored in the "OS/2" table. 
- font.CodePageRange = 0 - - // CreationTime and ModificationTime are stored in the "head" table. - font.CreationTime = time.Time{} - font.ModificationTime = time.Time{} - - // Description and License are stored in the "name" table. - font.Description = "" - font.License = "" - - // The floating point numbers in the glyphs may be represented differently. - // Let's hope the Glyphs are ok. - outlines := font.Outlines.(*cff.Outlines) - outlines.Glyphs = nil - - // Functions are difficult to compare. - outlines.FDSelect = nil - } - - if d := cmp.Diff(info1, info2); d != "" { - t.Errorf("info mismatch (-want +got):\n%s", d) - } -} diff --git a/font/opentype/font.go b/font/opentype/font.go index 2b4c2314..ffdc6082 100644 --- a/font/opentype/font.go +++ b/font/opentype/font.go @@ -170,11 +170,11 @@ func (f *Instance) Embed(rm *pdf.ResourceManager) (pdf.Native, font.Embedded, er } else { // glyf outlines if !opt.Composite { embedded = &embeddedGlyfSimple{ - w: w, - ref: ref, - sfnt: f.Font, - SimpleEncoder: encoding.NewSimpleEncoder(), - closed: false, + w: w, + ref: ref, + sfnt: f.Font, + TrueTypeEncoder: encoding.NewTrueTypeEncoder(), + closed: false, } } else { var gidToCID cmap.GIDToCID diff --git a/font/opentype/simplecff.go b/font/opentype/simplecff.go index 3017763a..2146572a 100644 --- a/font/opentype/simplecff.go +++ b/font/opentype/simplecff.go @@ -108,14 +108,15 @@ func (f *embeddedCFFSimple) Finish(rm *pdf.ResourceManager) error { } } - q := subsetSfnt.FontMatrix[3] * 1000 + qh := subsetSfnt.FontMatrix[0] * 1000 + qv := subsetSfnt.FontMatrix[3] * 1000 ascent := subsetSfnt.Ascent descent := subsetSfnt.Descent lineGap := subsetSfnt.LineGap var leadingPDF float64 if lineGap > 0 { - leadingPDF = (ascent - descent + lineGap).AsFloat(q) + leadingPDF = (ascent - descent + lineGap).AsFloat(qv) } fd := &font.Descriptor{ FontName: subset.Join(subsetTag, subsetCFF.FontName), @@ -130,13 +131,13 @@ func (f *embeddedCFFSimple) Finish(rm *pdf.ResourceManager) error { 
ForceBold: subsetCFF.Private[0].ForceBold, FontBBox: subsetSfnt.FontBBoxPDF().Rounded(), ItalicAngle: subsetSfnt.ItalicAngle, - Ascent: math.Round(ascent.AsFloat(q)), - Descent: math.Round(descent.AsFloat(q)), + Ascent: math.Round(ascent.AsFloat(qv)), + Descent: math.Round(descent.AsFloat(qv)), Leading: math.Round(leadingPDF), - CapHeight: math.Round(subsetSfnt.CapHeight.AsFloat(q)), - XHeight: math.Round(subsetSfnt.XHeight.AsFloat(q)), - StemV: subsetCFF.Private[0].StdVW, - StemH: subsetCFF.Private[0].StdHW, + CapHeight: math.Round(subsetSfnt.CapHeight.AsFloat(qv)), + XHeight: math.Round(subsetSfnt.XHeight.AsFloat(qv)), + StemV: math.Round(subsetCFF.Private[0].StdVW * qh), + StemH: math.Round(subsetCFF.Private[0].StdHW * qv), MissingWidth: subsetSfnt.GlyphWidthPDF(0), } res := &simple.Type1Dict{ diff --git a/font/opentype/simpleglyf.go b/font/opentype/simpleglyf.go index 7f5e2bd8..74d508ac 100644 --- a/font/opentype/simpleglyf.go +++ b/font/opentype/simpleglyf.go @@ -20,6 +20,7 @@ import ( "fmt" "math" + "seehuhn.de/go/postscript/type1/names" "seehuhn.de/go/sfnt" sfntcmap "seehuhn.de/go/sfnt/cmap" "seehuhn.de/go/sfnt/glyph" @@ -27,6 +28,7 @@ import ( "seehuhn.de/go/pdf" "seehuhn.de/go/pdf/font" "seehuhn.de/go/pdf/font/encoding" + "seehuhn.de/go/pdf/font/pdfenc" "seehuhn.de/go/pdf/font/simple" "seehuhn.de/go/pdf/font/subset" ) @@ -37,7 +39,7 @@ type embeddedGlyfSimple struct { sfnt *sfnt.Font - *encoding.SimpleEncoder + *encoding.TrueTypeEncoder closed bool } @@ -62,11 +64,11 @@ func (f *embeddedGlyfSimple) Finish(rm *pdf.ResourceManager) error { } f.closed = true - if f.SimpleEncoder.Overflow() { + if f.TrueTypeEncoder.Overflow() { return fmt.Errorf("too many distinct glyphs used in font %q", f.sfnt.PostScriptName()) } - enc := f.SimpleEncoder.Encoding + enc := f.TrueTypeEncoder.Encoding origSfnt := f.sfnt.Clone() origSfnt.CMapTable = nil @@ -75,7 +77,7 @@ func (f *embeddedGlyfSimple) Finish(rm *pdf.ResourceManager) error { origSfnt.Gpos = nil // subset the font - 
subsetGID := f.SimpleEncoder.Subset() + subsetGID := f.TrueTypeEncoder.Subset() subsetTag := subset.Tag(subsetGID, origSfnt.NumGlyphs()) subsetSfnt, err := origSfnt.Subset(subsetGID) if err != nil { @@ -91,26 +93,94 @@ func (f *embeddedGlyfSimple) Finish(rm *pdf.ResourceManager) error { subsetEncoding[i] = subsetGid[gid] } - // Mark the font as "symbolic", and use a (1, 0) "cmap" subtable to map - // character codes to glyphs. - // - // TODO(voss): also try the two allowed encodings for "non-symbolic" fonts. - // - // TODO(voss): revisit this, once - // https://github.com/pdf-association/pdf-issues/issues/316 is resolved. - isSymbolic := true - subtable := sfntcmap.Format4{} - for code, gid := range subsetEncoding { - if gid == 0 { + postScriptName := subsetSfnt.PostScriptName() + + // Follow the advice of section 9.6.5.4 of ISO 32000-2:2020: + // Only mark the font as non-symbolic if it can be encoded either + // using "MacRomanEncoding" or "WinAnsiEncoding". + var isSymbolic bool + var dictEnc encoding.Type1 + canMacRoman := true + canWinAnsi := true + var needsFormat12 bool + var text [256]string + f.TrueTypeEncoder.FillText(&text) + for code, s := range text { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { continue } - subtable[uint16(code)] = gid - } - subsetSfnt.CMapTable = sfntcmap.Table{ - {PlatformID: 1, EncodingID: 0}: subtable.Encode(0), + rr := []rune(s) + if len(rr) != 1 { + canMacRoman = false + canWinAnsi = false + break + } + r := rr[0] + if r >= 0x1_0000 { + needsFormat12 = true + } + glyphName := names.FromUnicode(r) + if pdfenc.MacRoman.Encoding[code] != glyphName { + canMacRoman = false + } + if pdfenc.WinAnsi.Encoding[code] != glyphName { + canWinAnsi = false + } + if !(canMacRoman || canWinAnsi) { + break + } } + if !(canMacRoman || canWinAnsi) { + // Mark the font as "symbolic", and use a (1, 0) "cmap" subtable to map + // character codes to glyphs.
+ isSymbolic = true + dictEnc = encoding.Builtin + + subtable := sfntcmap.Format4{} + for code, gid := range subsetEncoding { + if gid == 0 { + continue + } + subtable[uint16(code)] = gid + } + subsetSfnt.CMapTable = sfntcmap.Table{ + {PlatformID: 1, EncodingID: 0}: subtable.Encode(0), + } + } else { + isSymbolic = false + dictEnc = func(code byte) string { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { + return "" + } + return names.FromUnicode([]rune(text[code])[0]) + } - postScriptName := subsetSfnt.PostScriptName() + if needsFormat12 { + subtable := sfntcmap.Format12{} + for code, gid := range subsetEncoding { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { + continue + } + r := []rune(text[code])[0] + subtable[uint32(r)] = gid + } + subsetSfnt.CMapTable = sfntcmap.Table{ + {PlatformID: 3, EncodingID: 1}: subtable.Encode(0), + } + } else { + subtable := sfntcmap.Format4{} + for code, gid := range subsetEncoding { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { + continue + } + r := []rune(text[code])[0] + subtable[uint16(r)] = gid + } + subsetSfnt.CMapTable = sfntcmap.Table{ + {PlatformID: 3, EncodingID: 1}: subtable.Encode(0), + } + } + } q := subsetSfnt.FontMatrix[3] * 1000 @@ -146,7 +216,7 @@ func (f *embeddedGlyfSimple) Finish(rm *pdf.ResourceManager) error { PostScriptName: postScriptName, SubsetTag: subsetTag, Descriptor: fd, - Encoding: encoding.Builtin, + Encoding: dictEnc, IsOpenType: true, GetFont: func() (any, error) { return subsetSfnt, nil }, } @@ -154,7 +224,7 @@ func (f *embeddedGlyfSimple) Finish(rm *pdf.ResourceManager) error { gid := subsetEncoding[code] res.Width[code] = subsetSfnt.GlyphWidthPDF(gid) } - f.SimpleEncoder.FillText(&res.Text) + f.TrueTypeEncoder.FillText(&res.Text) return res.WriteToPDF(rm) } diff --git a/font/simple/type1.go b/font/simple/type1.go index c03cde27..ce2ee2da 100644 --- a/font/simple/type1.go +++ b/font/simple/type1.go @@ -348,7 +348,7 @@ func (d *Type1Dict) WriteToPDF(rm *pdf.ResourceManager) error { "Type": 
pdf.Name("Font"), "Subtype": pdf.Name("Type1"), "BaseFont": baseFont, - "XX_Seehuhn": pdf.Boolean(true), // TODO(voss): remove + "XX_Seehuhn": pdf.Name("Type1"), // TODO(voss): remove } if d.Name != "" { fontDict["Name"] = d.Name diff --git a/font/truetype/font.go b/font/truetype/font.go index fdbbcf21..9b24cf5a 100644 --- a/font/truetype/font.go +++ b/font/truetype/font.go @@ -148,10 +148,10 @@ func (f *Instance) Embed(rm *pdf.ResourceManager) (pdf.Native, font.Embedded, er } res = &embeddedSimple{ - w: w, - ref: ref, - sfnt: f.Font, - SimpleEncoder: encoding.NewSimpleEncoder(), + w: w, + ref: ref, + sfnt: f.Font, + TrueTypeEncoder: encoding.NewTrueTypeEncoder(), } } diff --git a/font/truetype/simple.go b/font/truetype/simple.go index 6eb7a1e6..4b3ab90f 100644 --- a/font/truetype/simple.go +++ b/font/truetype/simple.go @@ -21,7 +21,6 @@ import ( "math" "seehuhn.de/go/postscript/type1/names" - "seehuhn.de/go/sfnt" sfntcmap "seehuhn.de/go/sfnt/cmap" "seehuhn.de/go/sfnt/glyph" @@ -41,7 +40,7 @@ type embeddedSimple struct { sfnt *sfnt.Font - *encoding.SimpleEncoder + *encoding.TrueTypeEncoder closed bool } @@ -71,11 +70,11 @@ func (f *embeddedSimple) Finish(rm *pdf.ResourceManager) error { } f.closed = true - if f.SimpleEncoder.Overflow() { + if f.TrueTypeEncoder.Overflow() { return fmt.Errorf("too many distinct glyphs used in font %q", f.sfnt.PostScriptName()) } - enc := f.SimpleEncoder.Encoding + enc := f.TrueTypeEncoder.Encoding origSfnt := f.sfnt.Clone() origSfnt.CMapTable = nil @@ -84,7 +83,7 @@ func (f *embeddedSimple) Finish(rm *pdf.ResourceManager) error { origSfnt.Gpos = nil // subset the font - subsetGID := f.SimpleEncoder.Subset() + subsetGID := f.TrueTypeEncoder.Subset() subsetTag := subset.Tag(subsetGID, origSfnt.NumGlyphs()) subsetSfnt, err := origSfnt.Subset(subsetGID) if err != nil { @@ -100,26 +99,94 @@ func (f *embeddedSimple) Finish(rm *pdf.ResourceManager) error { subsetEncoding[i] = subsetGid[gid] } - // Mark the font as "symbolic", and use a (1, 
0) "cmap" subtable to map - // character codes to glyphs. - // - // TODO(voss): also try the two allowed encodings for "non-symbolic" fonts. - // - // TODO(voss): revisit this, once - // https://github.com/pdf-association/pdf-issues/issues/316 is resolved. - isSymbolic := true - subtable := sfntcmap.Format4{} - for code, gid := range subsetEncoding { - if gid == 0 { + postScriptName := subsetSfnt.PostScriptName() + + // Follow the advice of section 9.6.5.4 of ISO 32000-2:2020: + // Only make the font as non-symbolic, if it can be encoded either + // using "MacRomanEncoding" or "WinAnsiEncoding". + var isSymbolic bool + var dictEnc encoding.Type1 + canMacRoman := true + canWinAnsi := true + var needsFormat12 bool + var text [256]string + f.TrueTypeEncoder.FillText(&text) + for code, s := range text { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { continue } - subtable[uint16(code)] = gid - } - subsetSfnt.CMapTable = sfntcmap.Table{ - {PlatformID: 1, EncodingID: 0}: subtable.Encode(0), + rr := []rune(s) + if len(rr) != 1 { + canMacRoman = false + canWinAnsi = false + break + } + r := rr[0] + if r >= 0x1_0000 { + needsFormat12 = true + } + glyphName := names.FromUnicode(r) + if pdfenc.MacRoman.Encoding[code] != glyphName { + canMacRoman = false + } + if pdfenc.WinAnsi.Encoding[code] != glyphName { + canWinAnsi = false + } + if !(canMacRoman || canWinAnsi) { + break + } } + if !(canMacRoman || canWinAnsi) { + // Mark the font as "symbolic", and use a (1, 0) "cmap" subtable to map + // character codes to glyphs. 
+ isSymbolic = true + dictEnc = encoding.Builtin + + subtable := sfntcmap.Format4{} + for code, gid := range subsetEncoding { + if gid == 0 { + continue + } + subtable[uint16(code)] = gid + } + subsetSfnt.CMapTable = sfntcmap.Table{ + {PlatformID: 1, EncodingID: 0}: subtable.Encode(0), + } + } else { + isSymbolic = false + dictEnc = func(code byte) string { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { + return "" + } + return names.FromUnicode([]rune(text[code])[0]) + } - postScriptName := subsetSfnt.PostScriptName() + if needsFormat12 { + subtable := sfntcmap.Format12{} + for code, gid := range subsetEncoding { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { + continue + } + r := []rune(text[code])[0] + subtable[uint32(r)] = gid + } + subsetSfnt.CMapTable = sfntcmap.Table{ + {PlatformID: 3, EncodingID: 1}: subtable.Encode(0), + } + } else { + subtable := sfntcmap.Format4{} + for code, gid := range subsetEncoding { + if !f.TrueTypeEncoder.CodeIsUsed(byte(code)) { + continue + } + r := []rune(text[code])[0] + subtable[uint16(r)] = gid + } + subsetSfnt.CMapTable = sfntcmap.Table{ + {PlatformID: 3, EncodingID: 1}: subtable.Encode(0), + } + } + } q := subsetSfnt.FontMatrix[3] * 1000 @@ -155,7 +222,7 @@ func (f *embeddedSimple) Finish(rm *pdf.ResourceManager) error { PostScriptName: postScriptName, SubsetTag: subsetTag, Descriptor: fd, - Encoding: encoding.Builtin, + Encoding: dictEnc, IsOpenType: false, GetFont: func() (any, error) { return subsetSfnt, nil }, } @@ -163,67 +230,7 @@ func (f *embeddedSimple) Finish(rm *pdf.ResourceManager) error { gid := subsetEncoding[code] res.Width[code] = subsetSfnt.GlyphWidthPDF(gid) } - f.SimpleEncoder.FillText(&res.Text) + f.TrueTypeEncoder.FillText(&res.Text) return res.WriteToPDF(rm) } - -// ExtractEncoding tries to extract an encoding vector from the given encoding -// dictionary. See section 9.6.5.4 of ISO 32000-2:2020. 
-// -// TODO(voss): revisit this, once -// https://github.com/pdf-association/pdf-issues/issues/316 is resolved. -func ExtractEncoding(r pdf.Getter, encodingDict pdf.Object, ttf *sfnt.Font) []glyph.ID { - if encodingEntry, _ := pdf.Resolve(r, encodingDict); encodingEntry != nil { - encodingNames, _ := encoding.UndescribeEncodingType1(r, encodingEntry, pdfenc.Standard.Encoding[:]) - for i, name := range encodingNames { - if name == ".notdef" { - encodingNames[i] = pdfenc.Standard.Encoding[i] - } - } - - cmap, _ := ttf.CMapTable.GetNoLang(3, 1) - if cmap != nil { - encoding := make([]glyph.ID, 256) - for code, name := range encodingNames { - rr := names.ToUnicode(name, false) - if len(rr) == 1 { - encoding[code] = cmap.Lookup(rr[0]) - } - } - return encoding - } - // TODO(voss): also try to use a (1,0) subtable together with encodingNames - } - - cmap, _ := ttf.CMapTable.GetNoLang(3, 0) - if cmap != nil { - encoding := make([]glyph.ID, 256) - for code := rune(0); code < 256; code++ { - for _, pfx := range []rune{0xF000, 0xF100, 0xF200, 0x0000} { - if cmap.Lookup(pfx+code) != 0 { - encoding[code] = cmap.Lookup(pfx | code) - break - } - } - } - return encoding - } - - cmap, _ = ttf.CMapTable.GetNoLang(1, 0) - if cmap != nil { - encoding := make([]glyph.ID, 256) - for code := rune(0); code < 256; code++ { - encoding[code] = cmap.Lookup(code) - } - return encoding - } - - // encoding := make([]glyph.ID, 256) - // for i := range encoding { - // encoding[i] = glyph.ID(i) - // } - // return encoding - - return nil -} diff --git a/font/type3/font.go b/font/type3/font.go index cebfbaf8..e3829d01 100644 --- a/font/type3/font.go +++ b/font/type3/font.go @@ -197,10 +197,10 @@ func (f *embedded) Finish(*pdf.ResourceManager) error { CharProcs: glyphs, Encoding: func(code byte) string { return encoding[code] }, Descriptor: fd, - Width: [256]float64{}, Resources: f.Resources, } copy(res.Width[:], widths) + f.SimpleEncoder.FillText(&res.Text) return res.WriteToPDF(f.RM) } diff --git 
a/reader/font.go b/reader/font.go index af498735..9055093b 100644 --- a/reader/font.go +++ b/reader/font.go @@ -69,7 +69,7 @@ func (r *Reader) ReadFont(ref pdf.Object) (F FontFromFile, err error) { } func (r *Reader) readSimpleFont(info *font.Dicts, toUni *cmap.ToUnicodeFile) (F FontFromFile, err error) { - var enc *encoding.Encoding + var enc *encoding.EncodingOld switch info.DictType { case font.DictTypeSimpleType1: enc, err = encoding.ExtractType1Old(r.R, info) @@ -161,7 +161,7 @@ func (r *Reader) extractWidths(info *font.Dicts) ([]float64, error) { } type SimpleFont struct { - enc *encoding.Encoding + enc *encoding.EncodingOld info []*font.CodeInfo widths []float64 toUni *cmap.ToUnicodeFile