pipeline.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. // Copyright 2017 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package pipeline provides tools for creating translation pipelines.
  5. //
  6. // NOTE: UNDER DEVELOPMENT. API MAY CHANGE.
  7. package pipeline
  8. import (
  9. "bytes"
  10. "encoding/json"
  11. "fmt"
  12. "go/build"
  13. "go/parser"
  14. "io/ioutil"
  15. "log"
  16. "os"
  17. "path/filepath"
  18. "regexp"
  19. "strings"
  20. "text/template"
  21. "unicode"
  22. "golang.org/x/text/internal"
  23. "golang.org/x/text/language"
  24. "golang.org/x/text/runes"
  25. "golang.org/x/tools/go/loader"
  26. )
  27. const (
  28. extractFile = "extracted.gotext.json"
  29. outFile = "out.gotext.json"
  30. gotextSuffix = "gotext.json"
  31. )
// Config contains configuration for the translation pipeline.
type Config struct {
	// Supported indicates the languages for which data should be generated.
	// The default is to support all locales for which there are matching
	// translation files.
	Supported []language.Tag

	// --- Extraction

	// SourceLanguage is the language of the messages in the source code.
	// Import uses it as the initial language tag when walking Dir, and Merge
	// copies the original message into empty translations for this language.
	SourceLanguage language.Tag

	// Packages is not referenced in this part of the file; presumably it
	// lists the packages to extract messages from — TODO confirm.
	Packages []string

	// --- File structure

	// Dir is the root dir for all operations. When empty, "./locales" is
	// used.
	Dir string

	// TranslationsPattern is a regular expression to match incoming translation
	// files. These files may appear in any directory rooted at Dir. When empty,
	// a default pattern based on the "gotext.json" suffix is used.
	// The language for the translation files is determined as follows:
	//   1. From the Language field in the file.
	//   2. If not present, from a valid language tag in the filename, separated
	//      by dots (e.g. "en-US.json" or "incoming.pt_PT.xmb").
	//   3. If not present, from the closest subdirectory in which the file
	//      is contained that parses as a valid language tag.
	TranslationsPattern string

	// OutPattern defines the location for translation files for a certain
	// language. The default is "{{.Dir}}/{{.Language}}/out.{{.Ext}}"
	OutPattern string

	// Format defines the file format for generated translation files.
	// The default is XMB. Alternatives are GetText, XLIFF, L20n, GoText.
	// When Ext is empty, Format is also used as the {{.Ext}} value of
	// OutPattern.
	Format string

	// Ext is the value substituted for {{.Ext}} in OutPattern. When empty,
	// Format is used; when that is empty as well, "gotext.json" is used.
	Ext string

	// TODO:
	// Actions are additional actions to be performed after the initial extract
	// and merge.
	// Actions []struct {
	// 	Name    string
	// 	Options map[string]string
	// }

	// --- Generation

	// GenFile may be in a different package. If it is not defined, it will
	// be written to stdout.
	GenFile string

	// GenPackage is the package or relative path into which to generate the
	// file. If not specified it is relative to the current directory.
	GenPackage string

	// DeclareVar defines a variable to which to assign the generated Catalog.
	DeclareVar string

	// SetDefault determines whether to assign the generated Catalog to
	// message.DefaultCatalog. The default for this is true if DeclareVar is
	// not defined, false otherwise.
	SetDefault bool

	// TODO:
	// - Printf-style configuration
	// - Template-style configuration
	// - Extraction options
	// - Rewrite options
	// - Generation options
}
  87. // Operations:
  88. // - extract: get the strings
  89. // - disambiguate: find messages with the same key, but possibly different meaning.
  90. // - create out: create a list of messages that need translations
  91. // - load trans: load the list of current translations
  92. // - merge: assign list of translations as done
  93. // - (action)expand: analyze features and create example sentences for each version.
  94. // - (action)googletrans: pre-populate messages with automatic translations.
  95. // - (action)export: send out messages somewhere non-standard
  96. // - (action)import: load messages from somewhere non-standard
  97. // - vet program: don't pass "foo" + var + "bar" strings. Not using funcs for translated strings.
  98. // - vet trans: coverage: all translations/ all features.
  99. // - generate: generate Go code
// State holds all accumulated information on translations during processing.
type State struct {
	// Config is the pipeline configuration this State operates under.
	Config Config

	// Package is not referenced in this part of the file; presumably the
	// package being processed — TODO confirm.
	Package string
	// program is not referenced in this part of the file; presumably the
	// program produced by loadPackages — TODO confirm.
	program *loader.Program

	// Extracted holds the messages found in the source; Merge reads
	// Extracted.Messages. It is serialized under the JSON key "messages".
	Extracted Messages `json:"messages"`

	// Messages includes all messages for which there need to be translations.
	// Duplicates may be eliminated. Generation will be done from these messages
	// (usually after merging). Merge appends one entry per language.
	Messages []Messages

	// Translations are incoming translations for the application messages.
	// Import appends one entry per translation file it parses.
	Translations []Messages
}
  113. func (s *State) dir() string {
  114. if d := s.Config.Dir; d != "" {
  115. return d
  116. }
  117. return "./locales"
  118. }
  119. func outPattern(s *State) (string, error) {
  120. c := s.Config
  121. pat := c.OutPattern
  122. if pat == "" {
  123. pat = "{{.Dir}}/{{.Language}}/out.{{.Ext}}"
  124. }
  125. ext := c.Ext
  126. if ext == "" {
  127. ext = c.Format
  128. }
  129. if ext == "" {
  130. ext = gotextSuffix
  131. }
  132. t, err := template.New("").Parse(pat)
  133. if err != nil {
  134. return "", wrap(err, "error parsing template")
  135. }
  136. buf := bytes.Buffer{}
  137. err = t.Execute(&buf, map[string]string{
  138. "Dir": s.dir(),
  139. "Language": "%s",
  140. "Ext": ext,
  141. })
  142. return filepath.FromSlash(buf.String()), wrap(err, "incorrect OutPattern")
  143. }
  144. var transRE = regexp.MustCompile(`.*\.` + gotextSuffix)
  145. // Import loads existing translation files.
  146. func (s *State) Import() error {
  147. outPattern, err := outPattern(s)
  148. if err != nil {
  149. return err
  150. }
  151. re := transRE
  152. if pat := s.Config.TranslationsPattern; pat != "" {
  153. if re, err = regexp.Compile(pat); err != nil {
  154. return wrapf(err, "error parsing regexp %q", s.Config.TranslationsPattern)
  155. }
  156. }
  157. x := importer{s, outPattern, re}
  158. return x.walkImport(s.dir(), s.Config.SourceLanguage)
  159. }
// importer carries the data shared by the recursive translation-file walk
// started by Import.
type importer struct {
	// state receives the parsed files, appended to state.Translations.
	state *State
	// outPattern is a fmt format string taking the language tag; a file whose
	// path equals the formatted result is skipped during the walk.
	outPattern string
	// transFile selects, by file name, the files to read as translations.
	transFile *regexp.Regexp
}
  165. func (i *importer) walkImport(path string, tag language.Tag) error {
  166. files, err := ioutil.ReadDir(path)
  167. if err != nil {
  168. return nil
  169. }
  170. for _, f := range files {
  171. name := f.Name()
  172. tag := tag
  173. if f.IsDir() {
  174. if t, err := language.Parse(name); err == nil {
  175. tag = t
  176. }
  177. // We ignore errors
  178. if err := i.walkImport(filepath.Join(path, name), tag); err != nil {
  179. return err
  180. }
  181. continue
  182. }
  183. for _, l := range strings.Split(name, ".") {
  184. if t, err := language.Parse(l); err == nil {
  185. tag = t
  186. }
  187. }
  188. file := filepath.Join(path, name)
  189. // TODO: Should we skip files that match output files?
  190. if fmt.Sprintf(i.outPattern, tag) == file {
  191. continue
  192. }
  193. // TODO: handle different file formats.
  194. if !i.transFile.MatchString(name) {
  195. continue
  196. }
  197. b, err := ioutil.ReadFile(file)
  198. if err != nil {
  199. return wrap(err, "read file failed")
  200. }
  201. var translations Messages
  202. if err := json.Unmarshal(b, &translations); err != nil {
  203. return wrap(err, "parsing translation file failed")
  204. }
  205. i.state.Translations = append(i.state.Translations, translations)
  206. }
  207. return nil
  208. }
  209. // Merge merges the extracted messages with the existing translations.
  210. func (s *State) Merge() error {
  211. if s.Messages != nil {
  212. panic("already merged")
  213. }
  214. // Create an index for each unique message.
  215. // Duplicates are okay as long as the substitution arguments are okay as
  216. // well.
  217. // Top-level messages are okay to appear in multiple substitution points.
  218. // Collect key equivalence.
  219. msgs := []*Message{}
  220. keyToIDs := map[string]*Message{}
  221. for _, m := range s.Extracted.Messages {
  222. m := m
  223. if prev, ok := keyToIDs[m.Key]; ok {
  224. if err := checkEquivalence(&m, prev); err != nil {
  225. warnf("Key %q matches conflicting messages: %v and %v", m.Key, prev.ID, m.ID)
  226. // TODO: track enough information so that the rewriter can
  227. // suggest/disambiguate messages.
  228. }
  229. // TODO: add position to message.
  230. continue
  231. }
  232. i := len(msgs)
  233. msgs = append(msgs, &m)
  234. keyToIDs[m.Key] = msgs[i]
  235. }
  236. // Messages with different keys may still refer to the same translated
  237. // message (e.g. different whitespace). Filter these.
  238. idMap := map[string]bool{}
  239. filtered := []*Message{}
  240. for _, m := range msgs {
  241. found := false
  242. for _, id := range m.ID {
  243. found = found || idMap[id]
  244. }
  245. if !found {
  246. filtered = append(filtered, m)
  247. }
  248. for _, id := range m.ID {
  249. idMap[id] = true
  250. }
  251. }
  252. // Build index of translations.
  253. translations := map[language.Tag]map[string]Message{}
  254. languages := append([]language.Tag{}, s.Config.Supported...)
  255. for _, t := range s.Translations {
  256. tag := t.Language
  257. if _, ok := translations[tag]; !ok {
  258. translations[tag] = map[string]Message{}
  259. languages = append(languages, tag)
  260. }
  261. for _, m := range t.Messages {
  262. if !m.Translation.IsEmpty() {
  263. for _, id := range m.ID {
  264. if _, ok := translations[tag][id]; ok {
  265. warnf("Duplicate translation in locale %q for message %q", tag, id)
  266. }
  267. translations[tag][id] = m
  268. }
  269. }
  270. }
  271. }
  272. languages = internal.UniqueTags(languages)
  273. for _, tag := range languages {
  274. ms := Messages{Language: tag}
  275. for _, orig := range filtered {
  276. m := *orig
  277. m.Key = ""
  278. m.Position = ""
  279. for _, id := range m.ID {
  280. if t, ok := translations[tag][id]; ok {
  281. m.Translation = t.Translation
  282. if t.TranslatorComment != "" {
  283. m.TranslatorComment = t.TranslatorComment
  284. m.Fuzzy = t.Fuzzy
  285. }
  286. break
  287. }
  288. }
  289. if tag == s.Config.SourceLanguage && m.Translation.IsEmpty() {
  290. m.Translation = m.Message
  291. if m.TranslatorComment == "" {
  292. m.TranslatorComment = "Copied from source."
  293. m.Fuzzy = true
  294. }
  295. }
  296. // TODO: if translation is empty: pre-expand based on available
  297. // linguistic features. This may also be done as a plugin.
  298. ms.Messages = append(ms.Messages, m)
  299. }
  300. s.Messages = append(s.Messages, ms)
  301. }
  302. return nil
  303. }
  304. // Export writes out the messages to translation out files.
  305. func (s *State) Export() error {
  306. path, err := outPattern(s)
  307. if err != nil {
  308. return wrap(err, "export failed")
  309. }
  310. for _, out := range s.Messages {
  311. // TODO: inject translations from existing files to avoid retranslation.
  312. data, err := json.MarshalIndent(out, "", " ")
  313. if err != nil {
  314. return wrap(err, "JSON marshal failed")
  315. }
  316. file := fmt.Sprintf(path, out.Language)
  317. if err := os.MkdirAll(filepath.Dir(file), 0755); err != nil {
  318. return wrap(err, "dir create failed")
  319. }
  320. if err := ioutil.WriteFile(file, data, 0644); err != nil {
  321. return wrap(err, "write failed")
  322. }
  323. }
  324. return nil
  325. }
// Rune predicates built on the unicode.White_Space range table, in the form
// expected by strings.TrimRightFunc and strings.IndexFunc.
var (
	// ws is the membership test for the unicode.White_Space set.
	ws = runes.In(unicode.White_Space).Contains
	// notWS is the membership test for everything outside that set.
	notWS = runes.NotIn(unicode.White_Space).Contains
)
  330. func trimWS(s string) (trimmed, leadWS, trailWS string) {
  331. trimmed = strings.TrimRightFunc(s, ws)
  332. trailWS = s[len(trimmed):]
  333. if i := strings.IndexFunc(trimmed, notWS); i > 0 {
  334. leadWS = trimmed[:i]
  335. trimmed = trimmed[i:]
  336. }
  337. return trimmed, leadWS, trailWS
  338. }
  339. // NOTE: The command line tool already prefixes with "gotext:".
  340. var (
  341. wrap = func(err error, msg string) error {
  342. if err == nil {
  343. return nil
  344. }
  345. return fmt.Errorf("%s: %v", msg, err)
  346. }
  347. wrapf = func(err error, msg string, args ...interface{}) error {
  348. if err == nil {
  349. return nil
  350. }
  351. return wrap(err, fmt.Sprintf(msg, args...))
  352. }
  353. errorf = fmt.Errorf
  354. )
  355. func warnf(format string, args ...interface{}) {
  356. // TODO: don't log.
  357. log.Printf(format, args...)
  358. }
  359. func loadPackages(conf *loader.Config, args []string) (*loader.Program, error) {
  360. if len(args) == 0 {
  361. args = []string{"."}
  362. }
  363. conf.Build = &build.Default
  364. conf.ParserMode = parser.ParseComments
  365. // Use the initial packages from the command line.
  366. args, err := conf.FromArgs(args, false)
  367. if err != nil {
  368. return nil, wrap(err, "loading packages failed")
  369. }
  370. // Load, parse and type-check the whole program.
  371. return conf.Load()
  372. }