소스 검색

Merge branch 'NRGBA_support'

This adds optimized access routines for the image.NRGBA and image.NRGBA64
types. Internally, these image types are now preferred for the temporary and
result images because they are easier to handle when alpha channels are used.
nfnt 10 년 전
부모
커밋
3502bb8cd1
4개의 파일이 변경됨: 311줄 추가, 20줄 삭제
  1. 1 1
      README.md
  2. 91 3
      converter.go
  3. 86 0
      nearest.go
  4. 133 16
      resize.go

+ 1 - 1
README.md

@@ -90,7 +90,7 @@ func main() {
 Caveats
 Caveats
 -------
 -------
 
 
-* Optimized access routines are used for `image.RGBA`, `image.RGBA64`, `image.YCbCr`, `image.Gray`, and `image.Gray16` types. All other image types are accessed in a generic way that will result in slow processing speed.
+* Optimized access routines are used for `image.RGBA`, `image.NRGBA`, `image.RGBA64`, `image.NRGBA64`, `image.YCbCr`, `image.Gray`, and `image.Gray16` types. All other image types are accessed in a generic way that will result in slow processing speed.
 * JPEG images are stored in `image.YCbCr`. This image format stores data in a way that will decrease processing speed. A resize may be up to 2 times slower than with `image.RGBA`. 
 * JPEG images are stored in `image.YCbCr`. This image format stores data in a way that will decrease processing speed. A resize may be up to 2 times slower than with `image.RGBA`. 
 
 
 
 

+ 91 - 3
converter.go

@@ -43,7 +43,7 @@ func clampUint16(in int64) uint16 {
 	return 0
 	return 0
 }
 }
 
 
-func resizeGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
+func resizeGeneric(in image.Image, out *image.NRGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
 	newBounds := out.Bounds()
 	newBounds := out.Bounds()
 	maxX := in.Bounds().Dx() - 1
 	maxX := in.Bounds().Dx() - 1
 
 
@@ -89,7 +89,7 @@ func resizeGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []in
 	}
 	}
 }
 }
 
 
-func resizeRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []int16, offset []int, filterLength int) {
+func resizeRGBA(in *image.RGBA, out *image.NRGBA, scale float64, coeffs []int16, offset []int, filterLength int) {
 	newBounds := out.Bounds()
 	newBounds := out.Bounds()
 	maxX := in.Bounds().Dx() - 1
 	maxX := in.Bounds().Dx() - 1
 
 
@@ -129,7 +129,95 @@ func resizeRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []int16,
 	}
 	}
 }
 }
 
 
-func resizeRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
+// resizeNRGBA applies a one-dimensional weighted filter along the rows of in
+// and writes the result transposed into out: the x coordinate of out selects
+// the input row and the y coordinate of out selects the set of weights.
+//
+// coeffs holds filterLength weights per output position y (starting at index
+// y*filterLength) and offset[y] is the first input column those weights apply
+// to. The scale parameter is not used in this function body.
+func resizeNRGBA(in *image.NRGBA, out *image.NRGBA, scale float64, coeffs []int16, offset []int, filterLength int) {
+	newBounds := out.Bounds()
+	maxX := in.Bounds().Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		// Input row x, addressed relative to the start of in.Pix.
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			// Per-channel weighted sums and the sum of the weights used.
+			var rgba [4]int32
+			var sum int32
+			start := offset[y]
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				coeff := coeffs[ci+i]
+				if coeff != 0 {
+					// Turn the column index into a byte offset (4 bytes per
+					// pixel), clamping columns outside the row to the nearest
+					// edge pixel.
+					xi := start + i
+					switch {
+					case uint(xi) < uint(maxX):
+						xi *= 4
+					case xi >= maxX:
+						xi = 4 * maxX
+					default:
+						xi = 0
+					}
+					rgba[0] += int32(coeff) * int32(row[xi+0])
+					rgba[1] += int32(coeff) * int32(row[xi+1])
+					rgba[2] += int32(coeff) * int32(row[xi+2])
+					rgba[3] += int32(coeff) * int32(row[xi+3])
+					sum += int32(coeff)
+				}
+			}
+
+			// Transposed store: out row is y, out column is x.
+			// NOTE(review): if every weight for this y is zero, sum is 0 and
+			// the integer divisions below panic; presumably the weight tables
+			// never produce that case - confirm against the caller.
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
+			out.Pix[xo+0] = clampUint8(rgba[0] / sum)
+			out.Pix[xo+1] = clampUint8(rgba[1] / sum)
+			out.Pix[xo+2] = clampUint8(rgba[2] / sum)
+			out.Pix[xo+3] = clampUint8(rgba[3] / sum)
+		}
+	}
+}
+
+// resizeRGBA64 applies a one-dimensional weighted filter along the rows of in
+// and writes the result transposed into out: the x coordinate of out selects
+// the input row and the y coordinate of out selects the set of weights.
+//
+// coeffs holds filterLength weights per output position y (starting at index
+// y*filterLength) and offset[y] is the first input column those weights apply
+// to. The scale parameter is not used in this function body.
+//
+// NOTE(review): in is an image.RGBA64 and out an image.NRGBA64, but channel
+// values are filtered and stored as-is; no conversion between the two alpha
+// representations is visible here - confirm this is intended.
+func resizeRGBA64(in *image.RGBA64, out *image.NRGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
+	newBounds := out.Bounds()
+	maxX := in.Bounds().Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		// Input row x, addressed relative to the start of in.Pix.
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			// Per-channel weighted sums and the sum of the weights used.
+			var rgba [4]int64
+			var sum int64
+			start := offset[y]
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				coeff := coeffs[ci+i]
+				if coeff != 0 {
+					// Turn the column index into a byte offset (8 bytes per
+					// pixel), clamping columns outside the row to the nearest
+					// edge pixel.
+					xi := start + i
+					switch {
+					case uint(xi) < uint(maxX):
+						xi *= 8
+					case xi >= maxX:
+						xi = 8 * maxX
+					default:
+						xi = 0
+					}
+					// Each channel is two bytes, high byte first.
+					rgba[0] += int64(coeff) * int64(uint16(row[xi+0])<<8|uint16(row[xi+1]))
+					rgba[1] += int64(coeff) * int64(uint16(row[xi+2])<<8|uint16(row[xi+3]))
+					rgba[2] += int64(coeff) * int64(uint16(row[xi+4])<<8|uint16(row[xi+5]))
+					rgba[3] += int64(coeff) * int64(uint16(row[xi+6])<<8|uint16(row[xi+7]))
+					sum += int64(coeff)
+				}
+			}
+
+			// Transposed store: out row is y, out column is x. Each 16-bit
+			// result is written high byte first.
+			// NOTE(review): if every weight for this y is zero, sum is 0 and
+			// the integer divisions below panic; presumably the weight tables
+			// never produce that case - confirm against the caller.
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
+			value := clampUint16(rgba[0] / sum)
+			out.Pix[xo+0] = uint8(value >> 8)
+			out.Pix[xo+1] = uint8(value)
+			value = clampUint16(rgba[1] / sum)
+			out.Pix[xo+2] = uint8(value >> 8)
+			out.Pix[xo+3] = uint8(value)
+			value = clampUint16(rgba[2] / sum)
+			out.Pix[xo+4] = uint8(value >> 8)
+			out.Pix[xo+5] = uint8(value)
+			value = clampUint16(rgba[3] / sum)
+			out.Pix[xo+6] = uint8(value >> 8)
+			out.Pix[xo+7] = uint8(value)
+		}
+	}
+}
+
+func resizeNRGBA64(in *image.NRGBA64, out *image.NRGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
 	newBounds := out.Bounds()
 	newBounds := out.Bounds()
 	maxX := in.Bounds().Dx() - 1
 	maxX := in.Bounds().Dx() - 1
 
 

+ 86 - 0
nearest.go

@@ -118,6 +118,45 @@ func nearestRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []bool,
 	}
 	}
 }
 }
 
 
+// nearestNRGBA averages the selected pixels along the rows of in and writes
+// the result transposed into out: the x coordinate of out selects the input
+// row and the y coordinate of out selects the set of selection flags.
+//
+// coeffs holds filterLength boolean flags per output position y (starting at
+// index y*filterLength); a true flag includes the corresponding input column,
+// counted from offset[y], in the average. The scale parameter is not used in
+// this function body.
+func nearestNRGBA(in *image.NRGBA, out *image.NRGBA, scale float64, coeffs []bool, offset []int, filterLength int) {
+	newBounds := out.Bounds()
+	maxX := in.Bounds().Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		// Input row x, addressed relative to the start of in.Pix.
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			// Per-channel sums and the number of pixels that contributed.
+			var rgba [4]float32
+			var sum float32
+			start := offset[y]
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					// Turn the column index into a byte offset (4 bytes per
+					// pixel), clamping columns outside the row to the nearest
+					// edge pixel.
+					xi := start + i
+					switch {
+					case uint(xi) < uint(maxX):
+						xi *= 4
+					case xi >= maxX:
+						xi = 4 * maxX
+					default:
+						xi = 0
+					}
+					rgba[0] += float32(row[xi+0])
+					rgba[1] += float32(row[xi+1])
+					rgba[2] += float32(row[xi+2])
+					rgba[3] += float32(row[xi+3])
+					sum++
+				}
+			}
+
+			// Transposed store: out row is y, out column is x.
+			// NOTE(review): if no flag is set for this y, sum is 0 and the
+			// float divisions below evaluate 0/0; confirm the caller always
+			// selects at least one pixel.
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
+			out.Pix[xo+0] = floatToUint8(rgba[0] / sum)
+			out.Pix[xo+1] = floatToUint8(rgba[1] / sum)
+			out.Pix[xo+2] = floatToUint8(rgba[2] / sum)
+			out.Pix[xo+3] = floatToUint8(rgba[3] / sum)
+		}
+	}
+}
+
 func nearestRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
 func nearestRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
 	newBounds := out.Bounds()
 	newBounds := out.Bounds()
 	maxX := in.Bounds().Dx() - 1
 	maxX := in.Bounds().Dx() - 1
@@ -165,6 +204,53 @@ func nearestRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []
 	}
 	}
 }
 }
 
 
+// nearestNRGBA64 averages the selected pixels along the rows of in and writes
+// the result transposed into out: the x coordinate of out selects the input
+// row and the y coordinate of out selects the set of selection flags.
+//
+// coeffs holds filterLength boolean flags per output position y (starting at
+// index y*filterLength); a true flag includes the corresponding input column,
+// counted from offset[y], in the average. The scale parameter is not used in
+// this function body.
+func nearestNRGBA64(in *image.NRGBA64, out *image.NRGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
+	newBounds := out.Bounds()
+	maxX := in.Bounds().Dx() - 1
+
+	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
+		// Input row x, addressed relative to the start of in.Pix.
+		row := in.Pix[x*in.Stride:]
+		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
+			// Per-channel sums and the number of pixels that contributed.
+			var rgba [4]float32
+			var sum float32
+			start := offset[y]
+			ci := y * filterLength
+			for i := 0; i < filterLength; i++ {
+				if coeffs[ci+i] {
+					// Turn the column index into a byte offset (8 bytes per
+					// pixel), clamping columns outside the row to the nearest
+					// edge pixel.
+					xi := start + i
+					switch {
+					case uint(xi) < uint(maxX):
+						xi *= 8
+					case xi >= maxX:
+						xi = 8 * maxX
+					default:
+						xi = 0
+					}
+					// Each channel is two bytes, high byte first.
+					rgba[0] += float32(uint16(row[xi+0])<<8 | uint16(row[xi+1]))
+					rgba[1] += float32(uint16(row[xi+2])<<8 | uint16(row[xi+3]))
+					rgba[2] += float32(uint16(row[xi+4])<<8 | uint16(row[xi+5]))
+					rgba[3] += float32(uint16(row[xi+6])<<8 | uint16(row[xi+7]))
+					sum++
+				}
+			}
+
+			// Transposed store: out row is y, out column is x. Each 16-bit
+			// result is written high byte first.
+			// NOTE(review): if no flag is set for this y, sum is 0 and the
+			// float divisions below evaluate 0/0; confirm the caller always
+			// selects at least one pixel.
+			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
+			value := floatToUint16(rgba[0] / sum)
+			out.Pix[xo+0] = uint8(value >> 8)
+			out.Pix[xo+1] = uint8(value)
+			value = floatToUint16(rgba[1] / sum)
+			out.Pix[xo+2] = uint8(value >> 8)
+			out.Pix[xo+3] = uint8(value)
+			value = floatToUint16(rgba[2] / sum)
+			out.Pix[xo+4] = uint8(value >> 8)
+			out.Pix[xo+5] = uint8(value)
+			value = floatToUint16(rgba[3] / sum)
+			out.Pix[xo+6] = uint8(value >> 8)
+			out.Pix[xo+7] = uint8(value)
+		}
+	}
+}
+
 func nearestGray(in *image.Gray, out *image.Gray, scale float64, coeffs []bool, offset []int, filterLength int) {
 func nearestGray(in *image.Gray, out *image.Gray, scale float64, coeffs []bool, offset []int, filterLength int) {
 	newBounds := out.Bounds()
 	newBounds := out.Bounds()
 	maxX := in.Bounds().Dx() - 1
 	maxX := in.Bounds().Dx() - 1

+ 133 - 16
resize.go

@@ -105,14 +105,14 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 	switch input := img.(type) {
 	switch input := img.(type) {
 	case *image.RGBA:
 	case *image.RGBA:
 		// 8-bit precision
 		// 8-bit precision
-		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
+		temp := image.NewNRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA(image.Rect(0, 0, int(width), int(height)))
 
 
 		// horizontal filter, results in transposed temporary image
 		// horizontal filter, results in transposed temporary image
 		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
 		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
 		wg.Add(cpus)
 		wg.Add(cpus)
 		for i := 0; i < cpus; i++ {
 		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA)
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
 				resizeRGBA(input, slice, scaleX, coeffs, offset, filterLength)
 				resizeRGBA(input, slice, scaleX, coeffs, offset, filterLength)
@@ -124,14 +124,44 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
 		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
 		wg.Add(cpus)
 		wg.Add(cpus)
 		for i := 0; i < cpus; i++ {
 		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA)
+			slice := makeSlice(result, i, cpus).(*image.NRGBA)
+			go func() {
+				defer wg.Done()
+				resizeNRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.NRGBA:
+		// 8-bit precision
+		temp := image.NewNRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA)
+			go func() {
+				defer wg.Done()
+				resizeNRGBA(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.NRGBA)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
-				resizeRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
+				resizeNRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
 			}()
 			}()
 		}
 		}
 		wg.Wait()
 		wg.Wait()
 		return result
 		return result
+
 	case *image.YCbCr:
 	case *image.YCbCr:
 		// 8-bit precision
 		// 8-bit precision
 		// accessing the YCbCr arrays in a tight loop is slow.
 		// accessing the YCbCr arrays in a tight loop is slow.
@@ -164,14 +194,14 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 		return result.YCbCr()
 		return result.YCbCr()
 	case *image.RGBA64:
 	case *image.RGBA64:
 		// 16-bit precision
 		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
+		temp := image.NewNRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA64(image.Rect(0, 0, int(width), int(height)))
 
 
 		// horizontal filter, results in transposed temporary image
 		// horizontal filter, results in transposed temporary image
 		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
 		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
 		wg.Add(cpus)
 		wg.Add(cpus)
 		for i := 0; i < cpus; i++ {
 		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA64)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
 				resizeRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
 				resizeRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
@@ -183,10 +213,39 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
 		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
 		wg.Add(cpus)
 		wg.Add(cpus)
 		for i := 0; i < cpus; i++ {
 		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
+			slice := makeSlice(result, i, cpus).(*image.NRGBA64)
+			go func() {
+				defer wg.Done()
+				resizeNRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.NRGBA64:
+		// 16-bit precision
+		temp := image.NewNRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA64(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA64)
+			go func() {
+				defer wg.Done()
+				resizeNRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.NRGBA64)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
-				resizeGeneric(temp, slice, scaleY, coeffs, offset, filterLength)
+				resizeNRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
 			}()
 			}()
 		}
 		}
 		wg.Wait()
 		wg.Wait()
@@ -251,14 +310,14 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 		return result
 		return result
 	default:
 	default:
 		// 16-bit precision
 		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, img.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
+		temp := image.NewNRGBA64(image.Rect(0, 0, img.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA64(image.Rect(0, 0, int(width), int(height)))
 
 
 		// horizontal filter, results in transposed temporary image
 		// horizontal filter, results in transposed temporary image
 		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
 		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
 		wg.Add(cpus)
 		wg.Add(cpus)
 		for i := 0; i < cpus; i++ {
 		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA64)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
 				resizeGeneric(img, slice, scaleX, coeffs, offset, filterLength)
 				resizeGeneric(img, slice, scaleX, coeffs, offset, filterLength)
@@ -270,10 +329,10 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
 		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
 		wg.Add(cpus)
 		wg.Add(cpus)
 		for i := 0; i < cpus; i++ {
 		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
+			slice := makeSlice(result, i, cpus).(*image.NRGBA64)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
-				resizeRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
+				resizeNRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
 			}()
 			}()
 		}
 		}
 		wg.Wait()
 		wg.Wait()
@@ -316,6 +375,35 @@ func resizeNearest(width, height uint, scaleX, scaleY float64, img image.Image,
 		}
 		}
 		wg.Wait()
 		wg.Wait()
 		return result
 		return result
+	case *image.NRGBA:
+		// 8-bit precision
+		temp := image.NewNRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA)
+			go func() {
+				defer wg.Done()
+				nearestNRGBA(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.NRGBA)
+			go func() {
+				defer wg.Done()
+				nearestNRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
 	case *image.YCbCr:
 	case *image.YCbCr:
 		// 8-bit precision
 		// 8-bit precision
 		// accessing the YCbCr arrays in a tight loop is slow.
 		// accessing the YCbCr arrays in a tight loop is slow.
@@ -370,7 +458,36 @@ func resizeNearest(width, height uint, scaleX, scaleY float64, img image.Image,
 			slice := makeSlice(result, i, cpus).(*image.RGBA64)
 			slice := makeSlice(result, i, cpus).(*image.RGBA64)
 			go func() {
 			go func() {
 				defer wg.Done()
 				defer wg.Done()
-				nearestGeneric(temp, slice, scaleY, coeffs, offset, filterLength)
+				nearestRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+		return result
+	case *image.NRGBA64:
+		// 16-bit precision
+		temp := image.NewNRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
+		result := image.NewNRGBA64(image.Rect(0, 0, int(width), int(height)))
+
+		// horizontal filter, results in transposed temporary image
+		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(temp, i, cpus).(*image.NRGBA64)
+			go func() {
+				defer wg.Done()
+				nearestNRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
+			}()
+		}
+		wg.Wait()
+
+		// horizontal filter on transposed image, result is not transposed
+		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
+		wg.Add(cpus)
+		for i := 0; i < cpus; i++ {
+			slice := makeSlice(result, i, cpus).(*image.NRGBA64)
+			go func() {
+				defer wg.Done()
+				nearestNRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
 			}()
 			}()
 		}
 		}
 		wg.Wait()
 		wg.Wait()