From 58e8db3087a5851816a5ed72d5b6a5d80fc5721e Mon Sep 17 00:00:00 2001 From: Naushir Patuck Date: Fri, 27 Sep 2024 13:35:52 +0100 Subject: [PATCH] Add support for the IMX500 Add demo scripts for neural network inferencing on the IMX500 device Add an IMX500 devices helper library including sw postprocessing routines Signed-off-by: Naushir Patuck --- examples/imx500/assets/coco_labels.txt | 90 ++ examples/imx500/assets/colours.txt | 21 + examples/imx500/assets/imagenet_labels.txt | 1001 +++++++++++++++++ examples/imx500/imx500_classification_demo.py | 156 +++ .../imx500/imx500_object_detection_demo.py | 174 +++ .../imx500/imx500_object_detection_demo_mp.py | 194 ++++ ...imx500_pose_estimation_higherhrnet_demo.py | 117 ++ examples/imx500/imx500_segmentation_demo.py | 101 ++ picamera2/devices/__init__.py | 1 + picamera2/devices/imx500/__init__.py | 6 + picamera2/devices/imx500/imx500.py | 709 ++++++++++++ picamera2/devices/imx500/postprocess.py | 414 +++++++ .../imx500/postprocess_efficientdet_lite0.py | 213 ++++ .../devices/imx500/postprocess_highernet.py | 562 +++++++++ .../devices/imx500/postprocess_nanodet.py | 63 ++ .../devices/imx500/postprocess_yolov5.py | 244 ++++ .../devices/imx500/postprocess_yolov8.py | 176 +++ setup.py | 9 +- 18 files changed, 4248 insertions(+), 3 deletions(-) create mode 100644 examples/imx500/assets/coco_labels.txt create mode 100644 examples/imx500/assets/colours.txt create mode 100644 examples/imx500/assets/imagenet_labels.txt create mode 100755 examples/imx500/imx500_classification_demo.py create mode 100755 examples/imx500/imx500_object_detection_demo.py create mode 100755 examples/imx500/imx500_object_detection_demo_mp.py create mode 100755 examples/imx500/imx500_pose_estimation_higherhrnet_demo.py create mode 100755 examples/imx500/imx500_segmentation_demo.py create mode 100644 picamera2/devices/imx500/__init__.py create mode 100644 picamera2/devices/imx500/imx500.py create mode 100644 picamera2/devices/imx500/postprocess.py create mode 100644 picamera2/devices/imx500/postprocess_efficientdet_lite0.py create mode 100644 picamera2/devices/imx500/postprocess_highernet.py create mode 100644 picamera2/devices/imx500/postprocess_nanodet.py create mode 100644 picamera2/devices/imx500/postprocess_yolov5.py create mode 100644 picamera2/devices/imx500/postprocess_yolov8.py diff --git a/examples/imx500/assets/coco_labels.txt b/examples/imx500/assets/coco_labels.txt new file mode 100644 index 00000000..a76dd2a2 --- /dev/null +++ b/examples/imx500/assets/coco_labels.txt @@ -0,0 +1,90 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +- +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +- +backpack +umbrella +- +- +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +- +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +- +dining table +- +- +toilet +- +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +- +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/examples/imx500/assets/colours.txt b/examples/imx500/assets/colours.txt new file mode 100644 index 00000000..02141d11 --- /dev/null +++ b/examples/imx500/assets/colours.txt @@ -0,0 +1,21 @@ +128 0 0 255 +0 128 0 255 +128 128 0 255 +0 0 128 255 +128 0 128 
255 +0 128 128 255 +128 128 128 255 +64 0 0 255 +192 0 0 255 +64 128 0 255 +192 128 0 255 +64 0 128 255 +192 0 128 255 +64 128 128 255 +192 128 128 255 +0 64 0 255 +128 64 0 255 +0 192 0 255 +128 192 0 255 +0 64 128 255 +0 0 0 255 \ No newline at end of file diff --git a/examples/imx500/assets/imagenet_labels.txt b/examples/imx500/assets/imagenet_labels.txt new file mode 100644 index 00000000..17459f63 --- /dev/null +++ b/examples/imx500/assets/imagenet_labels.txt @@ -0,0 +1,1001 @@ +0:background +1:tench, Tinca tinca +2:goldfish, Carassius auratus +3:great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +4:tiger shark, Galeocerdo cuvieri +5:hammerhead, hammerhead shark +6:electric ray, crampfish, numbfish, torpedo +7:stingray +8:cock +9:hen +10:ostrich, Struthio camelus +11:brambling, Fringilla montifringilla +12:goldfinch, Carduelis carduelis +13:house finch, linnet, Carpodacus mexicanus +14:junco, snowbird +15:indigo bunting, indigo finch, indigo bird, Passerina cyanea +16:robin, American robin, Turdus migratorius +17:bulbul +18:jay +19:magpie +20:chickadee +21:water ouzel, dipper +22:kite +23:bald eagle, American eagle, Haliaeetus leucocephalus +24:vulture +25:great grey owl, great gray owl, Strix nebulosa +26:European fire salamander, Salamandra salamandra +27:common newt, Triturus vulgaris +28:eft +29:spotted salamander, Ambystoma maculatum +30:axolotl, mud puppy, Ambystoma mexicanum +31:bullfrog, Rana catesbeiana +32:tree frog, tree-frog +33:tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +34:loggerhead, loggerhead turtle, Caretta caretta +35:leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +36:mud turtle +37:terrapin +38:box turtle, box tortoise +39:banded gecko +40:common iguana, iguana, Iguana iguana +41:American chameleon, anole, Anolis carolinensis +42:whiptail, whiptail lizard +43:agama +44:frilled lizard, Chlamydosaurus kingi +45:alligator lizard +46:Gila monster, Heloderma suspectum +47:green lizard, Lacerta viridis +48:African chameleon, Chamaeleo chamaeleon +49:Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +50:African crocodile, Nile crocodile, Crocodylus niloticus +51:American alligator, Alligator mississipiensis +52:triceratops +53:thunder snake, worm snake, Carphophis amoenus +54:ringneck snake, ring-necked snake, ring snake +55:hognose snake, puff adder, sand viper +56:green snake, grass snake +57:king snake, kingsnake +58:garter snake, grass snake +59:water snake +60:vine snake +61:night snake, Hypsiglena torquata +62:boa constrictor, Constrictor constrictor +63:rock python, rock snake, Python sebae +64:Indian cobra, Naja naja +65:green mamba +66:sea snake +67:horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +68:diamondback, diamondback rattlesnake, Crotalus adamanteus +69:sidewinder, horned rattlesnake, Crotalus cerastes +70:trilobite +71:harvestman, daddy longlegs, Phalangium opilio +72:scorpion +73:black and gold garden spider, Argiope aurantia +74:barn spider, Araneus cavaticus +75:garden spider, Aranea diademata +76:black widow, Latrodectus mactans +77:tarantula +78:wolf spider, hunting spider +79:tick +80:centipede +81:black grouse +82:ptarmigan +83:ruffed grouse, partridge, Bonasa umbellus +84:prairie chicken, prairie grouse, prairie fowl +85:peacock +86:quail +87:partridge +88:African grey, African gray, Psittacus erithacus +89:macaw +90:sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +91:lorikeet +92:coucal +93:bee eater 
+94:hornbill +95:hummingbird +96:jacamar +97:toucan +98:drake +99:red-breasted merganser, Mergus serrator +100:goose +101:black swan, Cygnus atratus +102:tusker +103:echidna, spiny anteater, anteater +104:platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +105:wallaby, brush kangaroo +106:koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +107:wombat +108:jellyfish +109:sea anemone, anemone +110:brain coral +111:flatworm, platyhelminth +112:nematode, nematode worm, roundworm +113:conch +114:snail +115:slug +116:sea slug, nudibranch +117:chiton, coat-of-mail shell, sea cradle, polyplacophore +118:chambered nautilus, pearly nautilus, nautilus +119:Dungeness crab, Cancer magister +120:rock crab, Cancer irroratus +121:fiddler crab +122:king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +123:American lobster, Northern lobster, Maine lobster, Homarus americanus +124:spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +125:crayfish, crawfish, crawdad, crawdaddy +126:hermit crab +127:isopod +128:white stork, Ciconia ciconia +129:black stork, Ciconia nigra +130:spoonbill +131:flamingo +132:little blue heron, Egretta caerulea +133:American egret, great white heron, Egretta albus +134:bittern +135:crane +136:limpkin, Aramus pictus +137:European gallinule, Porphyrio porphyrio +138:American coot, marsh hen, mud hen, water hen, Fulica americana +139:bustard +140:ruddy turnstone, Arenaria interpres +141:red-backed sandpiper, dunlin, Erolia alpina +142:redshank, Tringa totanus +143:dowitcher +144:oystercatcher, oyster catcher +145:pelican +146:king penguin, Aptenodytes patagonica +147:albatross, mollymawk +148:grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +149:killer whale, killer, orca, grampus, sea wolf, Orcinus orca +150:dugong, Dugong dugon +151:sea lion +152:Chihuahua +153:Japanese spaniel +154:Maltese dog, Maltese terrier, Maltese +155:Pekinese, Pekingese, Peke +156:Shih-Tzu +157:Blenheim spaniel +158:papillon +159:toy terrier +160:Rhodesian ridgeback +161:Afghan hound, Afghan +162:basset, basset hound +163:beagle +164:bloodhound, sleuthhound +165:bluetick +166:black-and-tan coonhound +167:Walker hound, Walker foxhound +168:English foxhound +169:redbone +170:borzoi, Russian wolfhound +171:Irish wolfhound +172:Italian greyhound +173:whippet +174:Ibizan hound, Ibizan Podenco +175:Norwegian elkhound, elkhound +176:otterhound, otter hound +177:Saluki, gazelle hound +178:Scottish deerhound, deerhound +179:Weimaraner +180:Staffordshire bullterrier, Staffordshire bull terrier +181:American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +182:Bedlington terrier +183:Border terrier +184:Kerry blue terrier +185:Irish terrier +186:Norfolk terrier +187:Norwich terrier +188:Yorkshire terrier +189:wire-haired fox terrier +190:Lakeland terrier +191:Sealyham terrier, Sealyham +192:Airedale, Airedale terrier +193:cairn, cairn terrier +194:Australian terrier +195:Dandie Dinmont, Dandie Dinmont terrier +196:Boston bull, Boston terrier +197:miniature schnauzer +198:giant schnauzer +199:standard schnauzer +200:Scotch terrier, Scottish terrier, Scottie +201:Tibetan terrier, chrysanthemum dog +202:silky terrier, Sydney silky +203:soft-coated wheaten terrier +204:West Highland white terrier +205:Lhasa, Lhasa apso +206:flat-coated retriever +207:curly-coated retriever +208:golden retriever +209:Labrador retriever +210:Chesapeake Bay 
retriever +211:German short-haired pointer +212:vizsla, Hungarian pointer +213:English setter +214:Irish setter, red setter +215:Gordon setter +216:Brittany spaniel +217:clumber, clumber spaniel +218:English springer, English springer spaniel +219:Welsh springer spaniel +220:cocker spaniel, English cocker spaniel, cocker +221:Sussex spaniel +222:Irish water spaniel +223:kuvasz +224:schipperke +225:groenendael +226:malinois +227:briard +228:kelpie +229:komondor +230:Old English sheepdog, bobtail +231:Shetland sheepdog, Shetland sheep dog, Shetland +232:collie +233:Border collie +234:Bouvier des Flandres, Bouviers des Flandres +235:Rottweiler +236:German shepherd, German shepherd dog, German police dog, alsatian +237:Doberman, Doberman pinscher +238:miniature pinscher +239:Greater Swiss Mountain dog +240:Bernese mountain dog +241:Appenzeller +242:EntleBucher +243:boxer +244:bull mastiff +245:Tibetan mastiff +246:French bulldog +247:Great Dane +248:Saint Bernard, St Bernard +249:Eskimo dog, husky +250:malamute, malemute, Alaskan malamute +251:Siberian husky +252:dalmatian, coach dog, carriage dog +253:affenpinscher, monkey pinscher, monkey dog +254:basenji +255:pug, pug-dog +256:Leonberg +257:Newfoundland, Newfoundland dog +258:Great Pyrenees +259:Samoyed, Samoyede +260:Pomeranian +261:chow, chow chow +262:keeshond +263:Brabancon griffon +264:Pembroke, Pembroke Welsh corgi +265:Cardigan, Cardigan Welsh corgi +266:toy poodle +267:miniature poodle +268:standard poodle +269:Mexican hairless +270:timber wolf, grey wolf, gray wolf, Canis lupus +271:white wolf, Arctic wolf, Canis lupus tundrarum +272:red wolf, maned wolf, Canis rufus, Canis niger +273:coyote, prairie wolf, brush wolf, Canis latrans +274:dingo, warrigal, warragal, Canis dingo +275:dhole, Cuon alpinus +276:African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +277:hyena, hyaena +278:red fox, Vulpes vulpes +279:kit fox, Vulpes macrotis +280:Arctic fox, white fox, Alopex lagopus +281:grey fox, gray fox, Urocyon cinereoargenteus +282:tabby, tabby cat +283:tiger cat +284:Persian cat +285:Siamese cat, Siamese +286:Egyptian cat +287:cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +288:lynx, catamount +289:leopard, Panthera pardus +290:snow leopard, ounce, Panthera uncia +291:jaguar, panther, Panthera onca, Felis onca +292:lion, king of beasts, Panthera leo +293:tiger, Panthera tigris +294:cheetah, chetah, Acinonyx jubatus +295:brown bear, bruin, Ursus arctos +296:American black bear, black bear, Ursus americanus, Euarctos americanus +297:ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +298:sloth bear, Melursus ursinus, Ursus ursinus +299:mongoose +300:meerkat, mierkat +301:tiger beetle +302:ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +303:ground beetle, carabid beetle +304:long-horned beetle, longicorn, longicorn beetle +305:leaf beetle, chrysomelid +306:dung beetle +307:rhinoceros beetle +308:weevil +309:fly +310:bee +311:ant, emmet, pismire +312:grasshopper, hopper +313:cricket +314:walking stick, walkingstick, stick insect +315:cockroach, roach +316:mantis, mantid +317:cicada, cicala +318:leafhopper +319:lacewing, lacewing fly +320:dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +321:damselfly +322:admiral +323:ringlet, ringlet butterfly +324:monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +325:cabbage butterfly +326:sulphur butterfly, sulfur butterfly +327:lycaenid, lycaenid butterfly 
+328:starfish, sea star +329:sea urchin +330:sea cucumber, holothurian +331:wood rabbit, cottontail, cottontail rabbit +332:hare +333:Angora, Angora rabbit +334:hamster +335:porcupine, hedgehog +336:fox squirrel, eastern fox squirrel, Sciurus niger +337:marmot +338:beaver +339:guinea pig, Cavia cobaya +340:sorrel +341:zebra +342:hog, pig, grunter, squealer, Sus scrofa +343:wild boar, boar, Sus scrofa +344:warthog +345:hippopotamus, hippo, river horse, Hippopotamus amphibius +346:ox +347:water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +348:bison +349:ram, tup +350:bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +351:ibex, Capra ibex +352:hartebeest +353:impala, Aepyceros melampus +354:gazelle +355:Arabian camel, dromedary, Camelus dromedarius +356:llama +357:weasel +358:mink +359:polecat, fitch, foulmart, foumart, Mustela putorius +360:black-footed ferret, ferret, Mustela nigripes +361:otter +362:skunk, polecat, wood pussy +363:badger +364:armadillo +365:three-toed sloth, ai, Bradypus tridactylus +366:orangutan, orang, orangutang, Pongo pygmaeus +367:gorilla, Gorilla gorilla +368:chimpanzee, chimp, Pan troglodytes +369:gibbon, Hylobates lar +370:siamang, Hylobates syndactylus, Symphalangus syndactylus +371:guenon, guenon monkey +372:patas, hussar monkey, Erythrocebus patas +373:baboon +374:macaque +375:langur +376:colobus, colobus monkey +377:proboscis monkey, Nasalis larvatus +378:marmoset +379:capuchin, ringtail, Cebus capucinus +380:howler monkey, howler +381:titi, titi monkey +382:spider monkey, Ateles geoffroyi +383:squirrel monkey, Saimiri sciureus +384:Madagascar cat, ring-tailed lemur, Lemur catta +385:indri, indris, Indri indri, Indri brevicaudatus +386:Indian elephant, Elephas maximus +387:African elephant, Loxodonta africana +388:lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +389:giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +390:barracouta, snoek +391:eel +392:coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +393:rock beauty, Holocanthus tricolor +394:anemone fish +395:sturgeon +396:gar, garfish, garpike, billfish, Lepisosteus osseus +397:lionfish +398:puffer, pufferfish, blowfish, globefish +399:abacus +400:abaya +401:academic gown, academic robe, judge's robe +402:accordion, piano accordion, squeeze box +403:acoustic guitar +404:aircraft carrier, carrier, flattop, attack aircraft carrier +405:airliner +406:airship, dirigible +407:altar +408:ambulance +409:amphibian, amphibious vehicle +410:analog clock +411:apiary, bee house +412:apron +413:ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +414:assault rifle, assault gun +415:backpack, back pack, knapsack, packsack, rucksack, haversack +416:bakery, bakeshop, bakehouse +417:balance beam, beam +418:balloon +419:ballpoint, ballpoint pen, ballpen, Biro +420:Band Aid +421:banjo +422:bannister, banister, balustrade, balusters, handrail +423:barbell +424:barber chair +425:barbershop +426:barn +427:barometer +428:barrel, cask +429:barrow, garden cart, lawn cart, wheelbarrow +430:baseball +431:basketball +432:bassinet +433:bassoon +434:bathing cap, swimming cap +435:bath towel +436:bathtub, bathing tub, bath, tub +437:beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +438:beacon, lighthouse, beacon light, pharos +439:beaker +440:bearskin, busby, shako +441:beer bottle +442:beer glass +443:bell cote, bell cot +444:bib 
+445:bicycle-built-for-two, tandem bicycle, tandem +446:bikini, two-piece +447:binder, ring-binder +448:binoculars, field glasses, opera glasses +449:birdhouse +450:boathouse +451:bobsled, bobsleigh, bob +452:bolo tie, bolo, bola tie, bola +453:bonnet, poke bonnet +454:bookcase +455:bookshop, bookstore, bookstall +456:bottlecap +457:bow +458:bow tie, bow-tie, bowtie +459:brass, memorial tablet, plaque +460:brassiere, bra, bandeau +461:breakwater, groin, groyne, mole, bulwark, seawall, jetty +462:breastplate, aegis, egis +463:broom +464:bucket, pail +465:buckle +466:bulletproof vest +467:bullet train, bullet +468:butcher shop, meat market +469:cab, hack, taxi, taxicab +470:caldron, cauldron +471:candle, taper, wax light +472:cannon +473:canoe +474:can opener, tin opener +475:cardigan +476:car mirror +477:carousel, carrousel, merry-go-round, roundabout, whirligig +478:carpenter's kit, tool kit +479:carton +480:car wheel +481:cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +482:cassette +483:cassette player +484:castle +485:catamaran +486:CD player +487:cello, violoncello +488:cellular telephone, cellular phone, cellphone, cell, mobile phone +489:chain +490:chainlink fence +491:chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +492:chain saw, chainsaw +493:chest +494:chiffonier, commode +495:chime, bell, gong +496:china cabinet, china closet +497:Christmas stocking +498:church, church building +499:cinema, movie theater, movie theatre, movie house, picture palace +500:cleaver, meat cleaver, chopper +501:cliff dwelling +502:cloak +503:clog, geta, patten, sabot +504:cocktail shaker +505:coffee mug +506:coffeepot +507:coil, spiral, volute, whorl, helix +508:combination lock +509:computer keyboard, keypad +510:confectionery, confectionary, candy store +511:container ship, containership, container vessel +512:convertible +513:corkscrew, bottle screw +514:cornet, horn, trumpet, trump +515:cowboy boot +516:cowboy hat, ten-gallon hat +517:cradle +518:crane +519:crash helmet +520:crate +521:crib, cot +522:Crock Pot +523:croquet ball +524:crutch +525:cuirass +526:dam, dike, dyke +527:desk +528:desktop computer +529:dial telephone, dial phone +530:diaper, nappy, napkin +531:digital clock +532:digital watch +533:dining table, board +534:dishrag, dishcloth +535:dishwasher, dish washer, dishwashing machine +536:disk brake, disc brake +537:dock, dockage, docking facility +538:dogsled, dog sled, dog sleigh +539:dome +540:doormat, welcome mat +541:drilling platform, offshore rig +542:drum, membranophone, tympan +543:drumstick +544:dumbbell +545:Dutch oven +546:electric fan, blower +547:electric guitar +548:electric locomotive +549:entertainment center +550:envelope +551:espresso maker +552:face powder +553:feather boa, boa +554:file, file cabinet, filing cabinet +555:fireboat +556:fire engine, fire truck +557:fire screen, fireguard +558:flagpole, flagstaff +559:flute, transverse flute +560:folding chair +561:football helmet +562:forklift +563:fountain +564:fountain pen +565:four-poster +566:freight car +567:French horn, horn +568:frying pan, frypan, skillet +569:fur coat +570:garbage truck, dustcart +571:gasmask, respirator, gas helmet +572:gas pump, gasoline pump, petrol pump, island dispenser +573:goblet +574:go-kart +575:golf ball +576:golfcart, golf cart +577:gondola +578:gong, tam-tam +579:gown +580:grand piano, grand +581:greenhouse, nursery, glasshouse +582:grille, radiator grille +583:grocery store, 
grocery, food market, market +584:guillotine +585:hair slide +586:hair spray +587:half track +588:hammer +589:hamper +590:hand blower, blow dryer, blow drier, hair dryer, hair drier +591:hand-held computer, hand-held microcomputer +592:handkerchief, hankie, hanky, hankey +593:hard disc, hard disk, fixed disk +594:harmonica, mouth organ, harp, mouth harp +595:harp +596:harvester, reaper +597:hatchet +598:holster +599:home theater, home theatre +600:honeycomb +601:hook, claw +602:hoopskirt, crinoline +603:horizontal bar, high bar +604:horse cart, horse-cart +605:hourglass +606:iPod +607:iron, smoothing iron +608:jack-o'-lantern +609:jean, blue jean, denim +610:jeep, landrover +611:jersey, T-shirt, tee shirt +612:jigsaw puzzle +613:jinrikisha, ricksha, rickshaw +614:joystick +615:kimono +616:knee pad +617:knot +618:lab coat, laboratory coat +619:ladle +620:lampshade, lamp shade +621:laptop, laptop computer +622:lawn mower, mower +623:lens cap, lens cover +624:letter opener, paper knife, paperknife +625:library +626:lifeboat +627:lighter, light, igniter, ignitor +628:limousine, limo +629:liner, ocean liner +630:lipstick, lip rouge +631:Loafer +632:lotion +633:loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +634:loupe, jeweler's loupe +635:lumbermill, sawmill +636:magnetic compass +637:mailbag, postbag +638:mailbox, letter box +639:maillot +640:maillot, tank suit +641:manhole cover +642:maraca +643:marimba, xylophone +644:mask +645:matchstick +646:maypole +647:maze, labyrinth +648:measuring cup +649:medicine chest, medicine cabinet +650:megalith, megalithic structure +651:microphone, mike +652:microwave, microwave oven +653:military uniform +654:milk can +655:minibus +656:miniskirt, mini +657:minivan +658:missile +659:mitten +660:mixing bowl +661:mobile home, manufactured home +662:Model T +663:modem +664:monastery +665:monitor +666:moped +667:mortar +668:mortarboard +669:mosque +670:mosquito net +671:motor scooter, scooter +672:mountain bike, all-terrain bike, off-roader +673:mountain tent +674:mouse, computer mouse +675:mousetrap +676:moving van +677:muzzle +678:nail +679:neck brace +680:necklace +681:nipple +682:notebook, notebook computer +683:obelisk +684:oboe, hautboy, hautbois +685:ocarina, sweet potato +686:odometer, hodometer, mileometer, milometer +687:oil filter +688:organ, pipe organ +689:oscilloscope, scope, cathode-ray oscilloscope, CRO +690:overskirt +691:oxcart +692:oxygen mask +693:packet +694:paddle, boat paddle +695:paddlewheel, paddle wheel +696:padlock +697:paintbrush +698:pajama, pyjama, pj's, jammies +699:palace +700:panpipe, pandean pipe, syrinx +701:paper towel +702:parachute, chute +703:parallel bars, bars +704:park bench +705:parking meter +706:passenger car, coach, carriage +707:patio, terrace +708:pay-phone, pay-station +709:pedestal, plinth, footstall +710:pencil box, pencil case +711:pencil sharpener +712:perfume, essence +713:Petri dish +714:photocopier +715:pick, plectrum, plectron +716:pickelhaube +717:picket fence, paling +718:pickup, pickup truck +719:pier +720:piggy bank, penny bank +721:pill bottle +722:pillow +723:ping-pong ball +724:pinwheel +725:pirate, pirate ship +726:pitcher, ewer +727:plane, carpenter's plane, woodworking plane +728:planetarium +729:plastic bag +730:plate rack +731:plow, plough +732:plunger, plumber's helper +733:Polaroid camera, Polaroid Land camera +734:pole +735:police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +736:poncho +737:pool table, billiard table, snooker table +738:pop bottle, 
soda bottle +739:pot, flowerpot +740:potter's wheel +741:power drill +742:prayer rug, prayer mat +743:printer +744:prison, prison house +745:projectile, missile +746:projector +747:puck, hockey puck +748:punching bag, punch bag, punching ball, punchball +749:purse +750:quill, quill pen +751:quilt, comforter, comfort, puff +752:racer, race car, racing car +753:racket, racquet +754:radiator +755:radio, wireless +756:radio telescope, radio reflector +757:rain barrel +758:recreational vehicle, RV, R.V. +759:reel +760:reflex camera +761:refrigerator, icebox +762:remote control, remote +763:restaurant, eating house, eating place, eatery +764:revolver, six-gun, six-shooter +765:rifle +766:rocking chair, rocker +767:rotisserie +768:rubber eraser, rubber, pencil eraser +769:rugby ball +770:rule, ruler +771:running shoe +772:safe +773:safety pin +774:saltshaker, salt shaker +775:sandal +776:sarong +777:sax, saxophone +778:scabbard +779:scale, weighing machine +780:school bus +781:schooner +782:scoreboard +783:screen, CRT screen +784:screw +785:screwdriver +786:seat belt, seatbelt +787:sewing machine +788:shield, buckler +789:shoe shop, shoe-shop, shoe store +790:shoji +791:shopping basket +792:shopping cart +793:shovel +794:shower cap +795:shower curtain +796:ski +797:ski mask +798:sleeping bag +799:slide rule, slipstick +800:sliding door +801:slot, one-armed bandit +802:snorkel +803:snowmobile +804:snowplow, snowplough +805:soap dispenser +806:soccer ball +807:sock +808:solar dish, solar collector, solar furnace +809:sombrero +810:soup bowl +811:space bar +812:space heater +813:space shuttle +814:spatula +815:speedboat +816:spider web, spider's web +817:spindle +818:sports car, sport car +819:spotlight, spot +820:stage +821:steam locomotive +822:steel arch bridge +823:steel drum +824:stethoscope +825:stole +826:stone wall +827:stopwatch, stop watch +828:stove +829:strainer +830:streetcar, tram, tramcar, trolley, trolley car +831:stretcher +832:studio couch, day bed +833:stupa, tope +834:submarine, pigboat, sub, U-boat +835:suit, suit of clothes +836:sundial +837:sunglass +838:sunglasses, dark glasses, shades +839:sunscreen, sunblock, sun blocker +840:suspension bridge +841:swab, swob, mop +842:sweatshirt +843:swimming trunks, bathing trunks +844:swing +845:switch, electric switch, electrical switch +846:syringe +847:table lamp +848:tank, army tank, armored combat vehicle, armoured combat vehicle +849:tape player +850:teapot +851:teddy, teddy bear +852:television, television system +853:tennis ball +854:thatch, thatched roof +855:theater curtain, theatre curtain +856:thimble +857:thresher, thrasher, threshing machine +858:throne +859:tile roof +860:toaster +861:tobacco shop, tobacconist shop, tobacconist +862:toilet seat +863:torch +864:totem pole +865:tow truck, tow car, wrecker +866:toyshop +867:tractor +868:trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +869:tray +870:trench coat +871:tricycle, trike, velocipede +872:trimaran +873:tripod +874:triumphal arch +875:trolleybus, trolley coach, trackless trolley +876:trombone +877:tub, vat +878:turnstile +879:typewriter keyboard +880:umbrella +881:unicycle, monocycle +882:upright, upright piano +883:vacuum, vacuum cleaner +884:vase +885:vault +886:velvet +887:vending machine +888:vestment +889:viaduct +890:violin, fiddle +891:volleyball +892:waffle iron +893:wall clock +894:wallet, billfold, notecase, pocketbook +895:wardrobe, closet, press +896:warplane, military plane +897:washbasin, handbasin, washbowl, lavabo, wash-hand 
basin +898:washer, automatic washer, washing machine +899:water bottle +900:water jug +901:water tower +902:whiskey jug +903:whistle +904:wig +905:window screen +906:window shade +907:Windsor tie +908:wine bottle +909:wing +910:wok +911:wooden spoon +912:wool, woolen, woollen +913:worm fence, snake fence, snake-rail fence, Virginia fence +914:wreck +915:yawl +916:yurt +917:web site, website, internet site, site +918:comic book +919:crossword puzzle, crossword +920:street sign +921:traffic light, traffic signal, stoplight +922:book jacket, dust cover, dust jacket, dust wrapper +923:menu +924:plate +925:guacamole +926:consomme +927:hot pot, hotpot +928:trifle +929:ice cream, icecream +930:ice lolly, lolly, lollipop, popsicle +931:French loaf +932:bagel, beigel +933:pretzel +934:cheeseburger +935:hotdog, hot dog, red hot +936:mashed potato +937:head cabbage +938:broccoli +939:cauliflower +940:zucchini, courgette +941:spaghetti squash +942:acorn squash +943:butternut squash +944:cucumber, cuke +945:artichoke, globe artichoke +946:bell pepper +947:cardoon +948:mushroom +949:Granny Smith +950:strawberry +951:orange +952:lemon +953:fig +954:pineapple, ananas +955:banana +956:jackfruit, jak, jack +957:custard apple +958:pomegranate +959:hay +960:carbonara +961:chocolate sauce, chocolate syrup +962:dough +963:meat loaf, meatloaf +964:pizza, pizza pie +965:potpie +966:burrito +967:red wine +968:espresso +969:cup +970:eggnog +971:alp +972:bubble +973:cliff, drop, drop-off +974:coral reef +975:geyser +976:lakeside, lakeshore +977:promontory, headland, head, foreland +978:sandbar, sand bar +979:seashore, coast, seacoast, sea-coast +980:valley, vale +981:volcano +982:ballplayer, baseball player +983:groom, bridegroom +984:scuba diver +985:rapeseed +986:daisy +987:yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +988:corn +989:acorn +990:hip, rose hip, rosehip +991:buckeye, horse chestnut, conker +992:coral fungus +993:agaric +994:gyromitra +995:stinkhorn, carrion fungus +996:earthstar +997:hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +998:bolete +999:ear, spike, capitulum +1000:toilet tissue, toilet paper, bathroom tissue \ No newline at end of file diff --git a/examples/imx500/imx500_classification_demo.py b/examples/imx500/imx500_classification_demo.py new file mode 100755 index 00000000..1742471a --- /dev/null +++ b/examples/imx500/imx500_classification_demo.py @@ -0,0 +1,156 @@ +import argparse +import sys +import time +from typing import List + +import cv2 +import numpy as np + +from picamera2 import CompletedRequest, MappedArray, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import NetworkIntrinsics +from picamera2.devices.imx500.postprocess import softmax + +last_detections = [] +LABELS = None + + +class Classification: + def __init__(self, idx: int, score: float): + """Create a Classification object, recording the idx and score.""" + self.idx = idx + self.score = score + + +def get_label(request: CompletedRequest, idx: int) -> str: + """Retrieve the label corresponding to the classification index.""" + global LABELS + if LABELS is None: + LABELS = intrinsics.labels + assert len(LABELS) in [1000, 1001], "Labels file should contain 1000 or 1001 labels." 
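+        # The output tensor length reveals whether this network was compiled with a
+        # background class (1001 entries) or without one (1000 entries).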
+ output_tensor_size = imx500.get_output_shapes(request.get_metadata())[0][0] + if output_tensor_size == 1000: + LABELS = LABELS[1:] # Ignore the background label if present + return LABELS[idx] + + +def parse_and_draw_classification_results(request: CompletedRequest): + """Analyse and draw the classification results in the output tensor.""" + results = parse_classification_results(request) + draw_classification_results(request, results) + + +def parse_classification_results(request: CompletedRequest) -> List[Classification]: + """Parse the output tensor into the classification results above the threshold.""" + global last_detections + np_outputs = imx500.get_outputs(request.get_metadata()) + if np_outputs is None: + return last_detections + np_output = np_outputs[0] + if intrinsics.softmax: + np_output = softmax(np_output) + top_indices = np.argpartition(-np_output, 3)[:3] # Get top 3 indices with the highest scores + top_indices = top_indices[np.argsort(-np_output[top_indices])] # Sort the top 3 indices by their scores + last_detections = [Classification(index, np_output[index]) for index in top_indices] + return last_detections + + +def draw_classification_results(request: CompletedRequest, results: List[Classification], stream: str = "main"): + """Draw the classification results for this request onto the ISP output.""" + with MappedArray(request, stream) as m: + if intrinsics.preserve_aspect_ratio: + # Drawing ROI box + b_x, b_y, b_w, b_h = imx500.get_roi_scaled(request) + color = (255, 0, 0) # red + cv2.putText(m.array, "ROI", (b_x + 5, b_y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) + cv2.rectangle(m.array, (b_x, b_y), (b_x + b_w, b_y + b_h), (255, 0, 0, 0)) + text_left, text_top = b_x, b_y + 20 + else: + text_left, text_top = 0, 0 + # Drawing labels (in the ROI box if it exists) + for index, result in enumerate(results): + label = get_label(request, idx=result.idx) + text = f"{label}: {result.score:.3f}" + + # Calculate text size and position + (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + text_x = text_left + 5 + text_y = text_top + 15 + index * 20 + + # Create a copy of the array to draw the background with opacity + overlay = m.array.copy() + + # Draw the background rectangle on the overlay + cv2.rectangle(overlay, + (text_x, text_y - text_height), + (text_x + text_width, text_y + baseline), + (255, 255, 255), # Background color (white) + cv2.FILLED) + + alpha = 0.3 + cv2.addWeighted(overlay, alpha, m.array, 1 - alpha, 0, m.array) + + # Draw text on top of the background + cv2.putText(m.array, text, (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + +def get_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_mobilenet_v2.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("-s", "--softmax", action=argparse.BooleanOptionalAction, help="Add post-process softmax") + parser.add_argument("-r", "--preserve-aspect-ratio", action=argparse.BooleanOptionalAction, + help="preprocess the image with preserve aspect ratio") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before 
instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "classification" + elif intrinsics.task != "classification": + print("Network is not a classification task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.labels is None: + with open("assets/imagenet_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={"FrameRate": intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + if intrinsics.preserve_aspect_ratio: + imx500.set_auto_aspect_ratio() + # Register the callback to parse and draw classification results + picam2.pre_callback = parse_and_draw_classification_results + + while True: + time.sleep(0.5) diff --git a/examples/imx500/imx500_object_detection_demo.py b/examples/imx500/imx500_object_detection_demo.py new file mode 100755 index 00000000..ab792975 --- /dev/null +++ b/examples/imx500/imx500_object_detection_demo.py @@ -0,0 +1,174 @@ +import argparse +import sys +from functools import lru_cache + +import cv2 +import numpy as np + +from picamera2 import MappedArray, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import (NetworkIntrinsics, + postprocess_nanodet_detection) + +last_detections = [] + + +class Detection: + def __init__(self, coords, category, conf, metadata): + """Create a Detection object, recording the bounding box, category and confidence.""" + self.category = category + self.conf = conf + self.box = imx500.convert_inference_coords(coords, metadata, picam2) + + +def parse_detections(metadata: dict): + """Parse the output tensor into a number of detected objects, scaled to the ISP out.""" + global last_detections + bbox_normalization = intrinsics.bbox_normalization + threshold = args.threshold + iou = args.iou + max_detections = args.max_detections + + np_outputs = imx500.get_outputs(metadata, add_batch=True) + input_w, input_h = imx500.get_input_size() + if np_outputs is None: + return last_detections + if intrinsics.postprocess == "nanodet": + boxes, scores, classes = \ + postprocess_nanodet_detection(outputs=np_outputs[0], conf=threshold, iou_thres=iou, + max_out_dets=max_detections)[0] + from picamera2.devices.imx500.postprocess import scale_boxes + boxes = scale_boxes(boxes, 1, 1, input_h, input_w, False, False) + else: + boxes, scores, classes = np_outputs[0][0], np_outputs[1][0], np_outputs[2][0] + if bbox_normalization: + boxes = boxes / input_h + + boxes = np.array_split(boxes, 4, axis=1) + boxes = zip(*boxes) + + last_detections = [ + Detection(box, category, score, metadata) + for box, score, category in zip(boxes, scores, classes) + if score > threshold + ] + return last_detections + + +@lru_cache +def get_labels(): + labels = intrinsics.labels + + if intrinsics.ignore_dash_labels: + labels = [label for label in labels if label and label != "-"] + return labels + + +def draw_detections(request, stream="main"): + """Draw the detections for this 
request onto the ISP output.""" + detections = last_results + if detections is None: + return + labels = get_labels() + with MappedArray(request, stream) as m: + for detection in detections: + x, y, w, h = detection.box + label = f"{labels[int(detection.category)]} ({detection.conf:.2f})" + + # Calculate text size and position + (text_width, text_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + text_x = x + 5 + text_y = y + 15 + + # Create a copy of the array to draw the background with opacity + overlay = m.array.copy() + + # Draw the background rectangle on the overlay + cv2.rectangle(overlay, + (text_x, text_y - text_height), + (text_x + text_width, text_y + baseline), + (255, 255, 255), # Background color (white) + cv2.FILLED) + + alpha = 0.30 + cv2.addWeighted(overlay, alpha, m.array, 1 - alpha, 0, m.array) + + # Draw text on top of the background + cv2.putText(m.array, label, (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + # Draw detection box + cv2.rectangle(m.array, (x, y), (x + w, y + h), (0, 255, 0, 0), thickness=2) + + if intrinsics.preserve_aspect_ratio: + b_x, b_y, b_w, b_h = imx500.get_roi_scaled(request) + color = (255, 0, 0) # red + cv2.putText(m.array, "ROI", (b_x + 5, b_y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) + cv2.rectangle(m.array, (b_x, b_y), (b_x + b_w, b_y + b_h), (255, 0, 0, 0)) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--bbox-normalization", action=argparse.BooleanOptionalAction, help="Normalize bbox") + parser.add_argument("--threshold", type=float, default=0.55, help="Detection threshold") + parser.add_argument("--iou", type=float, default=0.65, help="Set iou threshold") + parser.add_argument("--max-detections", type=int, default=10, help="Set max detections") + parser.add_argument("--ignore-dash-labels", action=argparse.BooleanOptionalAction, help="Remove '-' labels ") + parser.add_argument("--postprocess", choices=["", "nanodet"], + default=None, help="Run post process of type") + parser.add_argument("-r", "--preserve-aspect-ratio", action=argparse.BooleanOptionalAction, + help="preserve the pixel aspect ratio of the input tensor") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "object detection" + elif intrinsics.task != "object detection": + print("Network is not an object detection task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.labels is None: + with open("assets/coco_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + 
print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={"FrameRate": intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + + if intrinsics.preserve_aspect_ratio: + imx500.set_auto_aspect_ratio() + + last_results = None + picam2.pre_callback = draw_detections + while True: + last_results = parse_detections(picam2.capture_metadata()) diff --git a/examples/imx500/imx500_object_detection_demo_mp.py b/examples/imx500/imx500_object_detection_demo_mp.py new file mode 100755 index 00000000..3bce0e6f --- /dev/null +++ b/examples/imx500/imx500_object_detection_demo_mp.py @@ -0,0 +1,194 @@ +import argparse +import multiprocessing +import queue +import sys +import threading +from functools import lru_cache + +import cv2 +import numpy as np + +from picamera2 import MappedArray, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import (NetworkIntrinsics, + postprocess_nanodet_detection) + + +class Detection: + def __init__(self, coords, category, conf, metadata): + """Create a Detection object, recording the bounding box, category and confidence.""" + self.category = category + self.conf = conf + self.box = imx500.convert_inference_coords(coords, metadata, picam2) + + +def parse_detections(metadata: dict): + """Parse the output tensor into a number of detected objects, scaled to the ISP out.""" + bbox_normalization = intrinsics.bbox_normalization + threshold = args.threshold + iou = args.iou + max_detections = args.max_detections + + np_outputs = imx500.get_outputs(metadata, add_batch=True) + input_w, input_h = imx500.get_input_size() + if np_outputs is None: + return None + if intrinsics.postprocess == "nanodet": + boxes, scores, classes = \ + postprocess_nanodet_detection(outputs=np_outputs[0], conf=threshold, iou_thres=iou, + max_out_dets=max_detections)[0] + from picamera2.devices.imx500.postprocess import scale_boxes + boxes = scale_boxes(boxes, 1, 1, input_h, input_w, False, False) + else: + boxes, scores, classes = np_outputs[0][0], np_outputs[1][0], np_outputs[2][0] + if bbox_normalization: + boxes = boxes / input_h + + boxes = np.array_split(boxes, 4, axis=1) + boxes = zip(*boxes) + + detections = [ + Detection(box, category, score, metadata) + for box, score, category in zip(boxes, scores, classes) + if score > threshold + ] + return detections + + +@lru_cache +def get_labels(): + labels = intrinsics.labels + + if intrinsics.ignore_dash_labels: + labels = [label for label in labels if label and label != "-"] + return labels + + +def draw_detections(jobs): + """Draw the detections for this request onto the ISP output.""" + labels = get_labels() + # Wait for result from child processes in the order submitted. 
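+    # Each queue entry pairs a camera request with the AsyncResult of its
+    # parse_detections job; blocking on get() keeps frames in submission order.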
+ last_detections = [] + while (job := jobs.get()) is not None: + request, async_result = job + detections = async_result.get() + if detections is None: + detections = last_detections + last_detections = detections + with MappedArray(request, 'main') as m: + for detection in detections: + x, y, w, h = detection.box + label = f"{labels[int(detection.category)]} ({detection.conf:.2f})" + + # Calculate text size and position + (text_width, text_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + text_x = x + 5 + text_y = y + 15 + + # Create a copy of the array to draw the background with opacity + overlay = m.array.copy() + + # Draw the background rectangle on the overlay + cv2.rectangle(overlay, + (text_x, text_y - text_height), + (text_x + text_width, text_y + baseline), + (255, 255, 255), # Background color (white) + cv2.FILLED) + + alpha = 0.3 + cv2.addWeighted(overlay, alpha, m.array, 1 - alpha, 0, m.array) + + # Draw text on top of the background + cv2.putText(m.array, label, (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1) + + # Draw detection box + cv2.rectangle(m.array, (x, y), (x + w, y + h), (0, 255, 0), thickness=2) + + if intrinsics.preserve_aspect_ratio: + b_x, b_y, b_w, b_h = imx500.get_roi_scaled(request) + color = (255, 0, 0) # red + cv2.putText(m.array, "ROI", (b_x + 5, b_y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1) + cv2.rectangle(m.array, (b_x, b_y), (b_x + b_w, b_y + b_h), (255, 0, 0, 0)) + + cv2.imshow('IMX500 Object Detection', m.array) + cv2.waitKey(1) + request.release() + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_ssd_mobilenetv2_fpnlite_320x320_pp.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--bbox-normalization", action=argparse.BooleanOptionalAction, help="Normalize bbox") + parser.add_argument("--threshold", type=float, default=0.55, help="Detection threshold") + parser.add_argument("--iou", type=float, default=0.65, help="Set iou threshold") + parser.add_argument("--max-detections", type=int, default=10, help="Set max detections") + parser.add_argument("--ignore-dash-labels", action=argparse.BooleanOptionalAction, help="Remove '-' labels ") + parser.add_argument("--postprocess", choices=["", "nanodet"], + default=None, help="Run post process of type") + parser.add_argument("-r", "--preserve-aspect-ratio", action=argparse.BooleanOptionalAction, + help="preserve the pixel aspect ratio of the input tensor") + parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "object detection" + elif intrinsics.task != "object detection": + print("Network is not an object detection task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.labels is 
None: + with open("assets/coco_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + main = {'format': 'RGB888'} + config = picam2.create_preview_configuration(main, controls={"FrameRate": intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=False) + if intrinsics.preserve_aspect_ratio: + imx500.set_auto_aspect_ratio() + + pool = multiprocessing.Pool(processes=4) + jobs = queue.Queue() + + thread = threading.Thread(target=draw_detections, args=(jobs,)) + thread.start() + + while True: + # The request gets released by handle_results + request = picam2.capture_request() + metadata = request.get_metadata() + if metadata: + async_result = pool.apply_async(parse_detections, (metadata,)) + jobs.put((request, async_result)) + else: + request.release() diff --git a/examples/imx500/imx500_pose_estimation_higherhrnet_demo.py b/examples/imx500/imx500_pose_estimation_higherhrnet_demo.py new file mode 100755 index 00000000..f443308d --- /dev/null +++ b/examples/imx500/imx500_pose_estimation_higherhrnet_demo.py @@ -0,0 +1,117 @@ +import argparse +import sys +import time + +import numpy as np + +from picamera2 import CompletedRequest, MappedArray, Picamera2 +from picamera2.devices.imx500 import IMX500, NetworkIntrinsics +from picamera2.devices.imx500.postprocess import COCODrawer +from picamera2.devices.imx500.postprocess_highernet import \ + postprocess_higherhrnet + +last_boxes = None +last_scores = None +last_keypoints = None +WINDOW_SIZE_H_W = (480, 640) + + +def ai_output_tensor_parse(metadata: dict): + """Parse the output tensor into a number of detected objects, scaled to the ISP out.""" + global last_boxes, last_scores, last_keypoints + np_outputs = imx500.get_outputs(metadata=metadata, add_batch=True) + if np_outputs is not None: + keypoints, scores, boxes = postprocess_higherhrnet(outputs=np_outputs, + img_size=WINDOW_SIZE_H_W, + img_w_pad=(0, 0), + img_h_pad=(0, 0), + detection_threshold=args.detection_threshold, + network_postprocess=True) + + if scores is not None and len(scores) > 0: + last_keypoints = np.reshape(np.stack(keypoints, axis=0), (len(scores), 17, 3)) + last_boxes = [np.array(b) for b in boxes] + last_scores = np.array(scores) + return last_boxes, last_scores, last_keypoints + + +def ai_output_tensor_draw(request: CompletedRequest, boxes, scores, keypoints, stream='main'): + """Draw the detections for this request onto the ISP output.""" + with MappedArray(request, stream) as m: + if boxes is not None and len(boxes) > 0: + drawer.annotate_image(m.array, boxes, scores, + np.zeros(scores.shape), keypoints, args.detection_threshold, + args.detection_threshold, request.get_metadata(), picam2, stream) + + +def picamera2_pre_callback(request: CompletedRequest): + """Analyse the detected objects in the output tensor and draw them on the main output image.""" + boxes, scores, keypoints = ai_output_tensor_parse(request.get_metadata()) + ai_output_tensor_draw(request, boxes, scores, keypoints) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_higherhrnet_coco.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--detection-threshold", type=float, default=0.3, + help="Post-process detection threshold") 
+ parser.add_argument("--labels", type=str, + help="Path to the labels file") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +def get_drawer(): + categories = intrinsics.labels + categories = [c for c in categories if c and c != "-"] + return COCODrawer(categories, imx500, needs_rescale_coords=False) + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "pose estimation" + elif intrinsics.task != "pose estimation": + print("Network is not a pose estimation task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if key == 'labels' and value is not None: + with open(value, 'r') as f: + intrinsics.labels = f.read().splitlines() + elif hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + if intrinsics.inference_rate is None: + intrinsics.inference_rate = 10 + if intrinsics.labels is None: + with open("assets/coco_labels.txt", "r") as f: + intrinsics.labels = f.read().splitlines() + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + drawer = get_drawer() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={'FrameRate': intrinsics.inference_rate}, buffer_count=12) + + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + imx500.set_auto_aspect_ratio() + picam2.pre_callback = picamera2_pre_callback + + while True: + time.sleep(0.5) diff --git a/examples/imx500/imx500_segmentation_demo.py b/examples/imx500/imx500_segmentation_demo.py new file mode 100755 index 00000000..4f3c1e77 --- /dev/null +++ b/examples/imx500/imx500_segmentation_demo.py @@ -0,0 +1,101 @@ +import argparse +import sys +import time +from typing import Dict + +import numpy as np + +from picamera2 import CompletedRequest, Picamera2 +from picamera2.devices import IMX500 +from picamera2.devices.imx500 import NetworkIntrinsics + +COLOURS = np.loadtxt("assets/colours.txt") + + +def create_and_draw_masks(request: CompletedRequest): + """Create masks from the output tensor and draw them on the main output image.""" + masks = create_masks(request) + draw_masks(masks) + + +def create_masks(request: CompletedRequest) -> Dict[int, np.ndarray]: + """Create masks from the output tensor, scaled to the ISP out.""" + res = {} + np_outputs = imx500.get_outputs(metadata=request.get_metadata()) + input_w, input_h = imx500.get_input_size() + if np_outputs is None: + return res + mask = np_outputs[0] + found_indices = np.unique(mask) + + for i in found_indices: + if i == 0: + continue + output_shape = [input_h, input_w, 4] + colour = [(0, 0, 0, 0), COLOURS[int(i)]] + colour[1][3] = 150 # update the alpha value here, to save setting it later + overlay = np.array(mask == i, dtype=np.uint8) + overlay = np.array(colour)[overlay].reshape(output_shape).astype(np.uint8) + # No need to resize the overlay, it will be stretched to the output window. 
+ res[i] = overlay + return res + + +def draw_masks(masks: Dict[int, np.ndarray]): + """Draw the masks for this request onto the ISP output.""" + if not masks: + return + input_w, input_h = imx500.get_input_size() + output_shape = [input_h, input_w, 4] + overlay = np.zeros(output_shape, dtype=np.uint8) + if masks: + for v in masks.values(): + overlay += v + # Set Alphas and overlay + picam2.set_overlay(overlay) + + +def get_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="Path of the model", + default="/usr/share/imx500-models/imx500_network_deeplabv3plus.rpk") + parser.add_argument("--fps", type=int, help="Frames per second") + parser.add_argument("--print-intrinsics", action="store_true", + help="Print JSON network_intrinsics then exit") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + # This must be called before instantiation of Picamera2 + imx500 = IMX500(args.model) + intrinsics = imx500.network_intrinsics + if not intrinsics: + intrinsics = NetworkIntrinsics() + intrinsics.task = "segmentation" + elif intrinsics.task != "segmentation": + print("Network is not a segmentation task", file=sys.stderr) + exit() + + # Override intrinsics from args + for key, value in vars(args).items(): + if hasattr(intrinsics, key) and value is not None: + setattr(intrinsics, key, value) + + # Defaults + intrinsics.update_with_defaults() + + if args.print_intrinsics: + print(intrinsics) + exit() + + picam2 = Picamera2(imx500.camera_num) + config = picam2.create_preview_configuration(controls={'FrameRate': intrinsics.inference_rate}, buffer_count=12) + imx500.show_network_fw_progress_bar() + picam2.start(config, show_preview=True) + picam2.pre_callback = create_and_draw_masks + + while True: + time.sleep(0.5) diff --git a/picamera2/devices/__init__.py b/picamera2/devices/__init__.py index a8ccb03b..5ef65f35 100644 --- a/picamera2/devices/__init__.py +++ b/picamera2/devices/__init__.py @@ -3,4 +3,5 @@ from .hailo import Hailo except ModuleNotFoundError: pass +from .imx500 import IMX500 from .imx708 import IMX708 diff --git a/picamera2/devices/imx500/__init__.py b/picamera2/devices/imx500/__init__.py new file mode 100644 index 00000000..2cabd5ee --- /dev/null +++ b/picamera2/devices/imx500/__init__.py @@ -0,0 +1,6 @@ +from .imx500 import IMX500, NetworkIntrinsics +from .postprocess_efficientdet_lite0 import \ + postprocess_efficientdet_lite0_detection +from .postprocess_nanodet import postprocess_nanodet_detection +from .postprocess_yolov5 import postprocess_yolov5_detection +from .postprocess_yolov8 import postprocess_yolov8_detection diff --git a/picamera2/devices/imx500/imx500.py b/picamera2/devices/imx500/imx500.py new file mode 100644 index 00000000..24748497 --- /dev/null +++ b/picamera2/devices/imx500/imx500.py @@ -0,0 +1,709 @@ +import ctypes +import fcntl +import io +import json +import multiprocessing +import os +import struct +import sys +import time +from typing import List, Optional + +import jsonschema +import numpy as np +from libarchive.read import fd_reader +from libcamera import Rectangle, Size +from tqdm import tqdm +from v4l2 import (VIDIOC_S_CTRL, VIDIOC_S_EXT_CTRLS, v4l2_control, + v4l2_ext_control, v4l2_ext_controls) + +from picamera2 import CompletedRequest, Picamera2 + +NETWORK_NAME_LEN = 64 +MAX_NUM_TENSORS = 8 +MAX_NUM_DIMENSIONS = 8 + +FW_LOADER_STAGE = 0 +FW_MAIN_STAGE = 1 +FW_NETWORK_STAGE = 2 + +NETWORK_FW_FD_CTRL_ID = 0x00982901 +ROI_CTRL_ID = 0x00982900 + + 
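+# V4L2 control IDs: NETWORK_FW_FD_CTRL_ID passes the network firmware file
+# descriptor to the driver; ROI_CTRL_ID sets the inference region of interest.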
+# struct OutputTensorInfo from libcamera +class OutputTensorInfo(ctypes.LittleEndianStructure): + _fields_ = [ + ('tensor_data_num', ctypes.c_uint32), + ('num_dimensions', ctypes.c_uint32), + ('size', ctypes.c_uint16 * MAX_NUM_DIMENSIONS), + ] + + +# struct CnnOutputTensorInfoExported from libcamera +class CnnOutputTensorInfoExported(ctypes.LittleEndianStructure): + _fields_ = [ + ('network_name', ctypes.c_char * NETWORK_NAME_LEN), + ('num_tensors', ctypes.c_uint32), + ('info', OutputTensorInfo * MAX_NUM_TENSORS) + ] + + +class NetworkIntrinsics: + def __init__(self, val=None): + self.__intrinsics: Optional[dict] = None + self.__schema = { + "$schema": "https://json-schema.org/draft-07/schema", + "title": "network_intrinsics", + "type": "object", + "properties": { + "task": { + "type": "string", + "enum": ["classification", "object detection", "pose estimation", "segmentation"], + "description": "Network task", + }, + "inference_rate": {"type": "number", "minimum": 0}, + "cpu": { + "type": "object", + "properties": { + "bbox_normalization": {"type": "boolean"}, + "softmax": {"type": "boolean"}, + "post_processing": {"type": "string"}, + }, + }, + "input_aspect_ratio": { + "type": "object", + "properties": { + "width": {"type": "integer", "exclusiveMinimum": 0}, + "height": {"type": "integer", "exclusiveMinimum": 0}, + }, + "required": ["width", "height"], + }, + "classes": { + "type": "object", + "properties": { + "labels": {"type": "array", "items": {"type": "string"}}, + "ignore_undefined": {"type": "boolean"}, + }, + }, + }, + } + if val is not None: + jsonschema.validate(val, self.__schema) + self.__intrinsics = val + + self.__defaults = {'inference_rate': 30.0} + jsonschema.validate(self.__defaults, self.__schema | {'additionalProperties': False}) + + @property + def intrinsics(self) -> Optional[dict]: + return self.__intrinsics + + @intrinsics.setter + def intrinsics(self, val): + jsonschema.validate(val, self.__schema) + self.__intrinsics = val + + def __repr__(self): + return json.dumps(self.__intrinsics) if self.__intrinsics else "" + + def __top_level_validated_insert(self, val: dict): + jsonschema.validate(val, self.__schema | {'additionalProperties': False}) + self.__intrinsics = self.__intrinsics | val if self.__intrinsics else val + + def __intrinsics_has_key(self, key: str) -> bool: + return key in self.__intrinsics if self.__intrinsics else False + + def __intrinsics_get_key(self, key, default=None): + return self.__intrinsics.get(key, default) if self.__intrinsics else default + + def update_with_defaults(self): + # Updates intrinsics with default settings (but does not overwrite) + if not self.__intrinsics: + self.__intrinsics = {} + self.__intrinsics = self.__defaults | self.__intrinsics + + @property + def task(self) -> Optional[str]: + return self.__intrinsics_get_key('task') + + @task.setter + def task(self, val: str): + self.__top_level_validated_insert({'task': val}) + + @property + def inference_rate(self) -> Optional[float]: + return self.__intrinsics_get_key('inference_rate') + + @inference_rate.setter + def inference_rate(self, val: float): + if val < 0: + if self.__intrinsics is not None: + self.__intrinsics.pop('inference_rate', None) + else: + self.__top_level_validated_insert({'inference_rate': val}) + + @property + def fps(self) -> Optional[float]: + # @deprecated("Prefer inference_rate") + return self.inference_rate + + @fps.setter + def fps(self, val: Optional[float]): + # @deprecated("Prefer inference_rate") + self.inference_rate = val + + def 
__get_cpu(self, key: str): + return self.__intrinsics['cpu'].get(key, None) if self.__intrinsics_has_key('cpu') else None + + def __set_cpu(self, val: dict): + jsonschema.validate({'cpu': val}, self.__schema | {'additionalProperties': False}) + cpu = self.__intrinsics_get_key('cpu', {}) | val + if self.__intrinsics: + self.__intrinsics['cpu'] = cpu + else: + self.__intrinsics = {'cpu': cpu} + + @property + def bbox_normalization(self) -> Optional[bool]: + return self.__get_cpu('bbox_normalization') + + @bbox_normalization.setter + def bbox_normalization(self, val: Optional[bool]): + if val is None: + return + + if val: + self.__set_cpu({'bbox_normalization': val}) + elif self.__intrinsics_has_key('cpu'): + self.__intrinsics['cpu'].pop('bbox_normalization', None) + + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def softmax(self) -> Optional[bool]: + return self.__get_cpu('softmax') + + @softmax.setter + def softmax(self, val: Optional[bool]): + if val is None: + return + + if val: + self.__set_cpu({'softmax': val}) + elif self.__intrinsics_has_key('cpu'): + self.__intrinsics['cpu'].pop('softmax', None) + + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def postprocess(self) -> Optional[str]: + return self.__get_cpu('post_processing') + + @postprocess.setter + def postprocess(self, val: str): + if val != "": + self.__set_cpu({'post_processing': val}) + elif self.__intrinsics_has_key('cpu'): + self.__intrinsics['cpu'].pop('post_processing', None) + + if self.__intrinsics_has_key('cpu') and len(self.__intrinsics['cpu']) == 0: + self.__intrinsics.pop('cpu') + + @property + def preserve_aspect_ratio(self) -> Optional[bool]: + if not self.__intrinsics_has_key('input_aspect_ratio'): + return None + ar = self.__intrinsics['input_aspect_ratio'] + return ar['width'] == ar['height'] + + @preserve_aspect_ratio.setter + def preserve_aspect_ratio(self, val: Optional[bool]): + if val is None: + return + + if val: + iar = {'input_aspect_ratio': {'width': 1, 'height': 1}} + self.__top_level_validated_insert(iar) + elif self.__intrinsics_has_key('input_aspect_ratio'): + self.__intrinsics.pop('input_aspect_ratio') + + @property + def labels(self) -> Optional[List[str]]: + return self.__intrinsics['classes'].get('labels', None) if self.__intrinsics_has_key('classes') else None + + @labels.setter + def labels(self, val: List[str]): + if len(val) != 0: + classes = {'labels': val} + jsonschema.validate({'classes': classes}, self.__schema | {'additionalProperties': False}) + + classes = self.__intrinsics_get_key('classes', {}) | classes + if self.__intrinsics: + self.__intrinsics['classes'] = classes + else: + self.__intrinsics = {'classes': classes} + elif self.__intrinsics_has_key('classes'): + self.__intrinsics['classes'].pop('labels', None) + if len(self.__intrinsics['classes']) == 0: + self.__intrinsics.pop('classes') + + @property + def ignore_dash_labels(self) -> Optional[bool]: + return self.__intrinsics['classes'].get('ignore_undefined', None) if self.__intrinsics_has_key('classes') else None + + @ignore_dash_labels.setter + def ignore_dash_labels(self, val: Optional[bool]): + if val is None: + return + + if val: + iu = {'ignore_undefined': val} + jsonschema.validate({'classes': iu}, self.__schema | {'additionalProperties': False}) + + classes = {'classes': self.__intrinsics_get_key('classes', {}) | iu} + self.__intrinsics = self.__intrinsics | classes 
if self.__intrinsics else classes
+        elif self.__intrinsics_has_key('classes'):
+            self.__intrinsics['classes'].pop('ignore_undefined', None)
+            if len(self.__intrinsics['classes']) == 0:
+                self.__intrinsics.pop('classes')
+
+
+class IMX500:
+    def __init__(self, network_file: str, camera_id: str = ''):
+        self.device_fd = None
+        self.fw_progress = None
+        self.fw_progress_chunk = None
+        self.__cfg = {'network_file': network_file, 'input_tensor': {}}
+
+        imx500_device_id = None
+        spi_device_id = None
+        for i in range(32):
+            test_dir = f'/sys/class/video4linux/v4l-subdev{i}/device'
+            module_dir = f'{test_dir}/driver/module'
+            id_dir = f'{test_dir}/of_node'
+            if os.path.exists(module_dir) and os.path.islink(module_dir) and os.path.islink(id_dir) \
+                    and 'imx500' in os.readlink(module_dir):
+                if camera_id == '' or (camera_id in os.readlink(id_dir)):
+                    self.device_fd = open(f'/dev/v4l-subdev{i}', 'rb+', buffering=0)
+                    imx500_device_id = os.readlink(test_dir).split('/')[-1]
+                    spi_device_id = imx500_device_id.replace('001a', '0040')
+                    camera_info = Picamera2.global_camera_info()
+                    self.__camera_num = next((c['Num'] for c in camera_info if c['Model'] == 'imx500'
+                                              and c['Id'] in os.readlink(id_dir)))
+                    break
+
+        if self.device_fd is None:
+            raise RuntimeError('IMX500: Requested camera dev-node not found')
+
+        # Progress status specific debugfs entries.
+        if imx500_device_id:
+            self.fw_progress = open(f'/sys/kernel/debug/imx500-fw:{imx500_device_id}/fw_progress', 'r')
+        if spi_device_id:
+            self.fw_progress_chunk = open(f'/sys/kernel/debug/rp2040-spi:{spi_device_id}/transfer_progress', 'r')
+
+        if self.config['network_file'] != '':
+            self.__set_network_firmware(os.path.abspath(self.config['network_file']))
+            self.__ni_from_network(os.path.abspath(self.config['network_file']))
+
+        # Fall back to default normalisation parameters for anything the network
+        # file did not provide.
+        if 'norm_val' not in self.__cfg['input_tensor']:
+            self.__cfg['input_tensor']['norm_val'] = [-2048, -2048, -2048]
+        if 'norm_shift' not in self.__cfg['input_tensor']:
+            self.__cfg['input_tensor']['norm_shift'] = [4, 4, 4]
+        if 'div_val' not in self.__cfg['input_tensor']:
+            self.__cfg['input_tensor']['div_val'] = [1024, 1024, 1024]
+        if 'div_shift' not in self.__cfg['input_tensor']:
+            self.__cfg['input_tensor']['div_shift'] = 6
+
+        full_sensor = self.__get_full_sensor_resolution()
+        self.set_inference_roi_abs(full_sensor.to_tuple())
+
+    @staticmethod
+    def __get_full_sensor_resolution():
+        """Full sensor resolution as a Rectangle object."""
+        return Rectangle(0, 0, 4056, 3040)
+
+    def __del__(self):
+        if self.device_fd:
+            self.device_fd.close()
+
+    @property
+    def camera_num(self):
+        return self.__camera_num
+
+    @property
+    def config(self) -> dict:
+        return self.__cfg
+
+    @property
+    def network_intrinsics(self) -> Optional[NetworkIntrinsics]:
+        return self.__cfg.get('intrinsics', None)
+
+    def convert_inference_coords(self, coords: tuple, metadata: dict, picam2: Picamera2, stream='main') -> tuple:
+        """Convert relative inference coordinates into the output image coordinate space."""
+        isp_output_size = Size(*picam2.camera_configuration()[stream]['size'])
+        sensor_output_size = Size(*picam2.camera_configuration()['raw']['size'])
+        scaler_crop = Rectangle(*metadata['ScalerCrop'])
+
+        y0, x0, y1, x1 = coords
+        full_sensor = self.__get_full_sensor_resolution()
+        width, height = full_sensor.size.to_tuple()
+        obj = Rectangle(
+            *np.maximum(
+                np.array([x0 * width, y0 * height, (x1 - x0) * width, (y1 - y0) * height]),
+                0,
+            ).astype(np.int32)
+        )
+        out = self.__get_obj_scaled(obj, isp_output_size, scaler_crop, sensor_output_size)
+        return out.to_tuple()
+
+    def 
get_fw_upload_progress(self, stage_req) -> tuple: + """Returns the current progress of the fw upload in the form of (current, total).""" + progress_block = 0 + progress_chunk = 0 + size = 0 + stage = 0 + + if self.fw_progress: + self.fw_progress.seek(0) + progress = self.fw_progress.readline().strip().split() + stage = int(progress[0]) + progress_block = int(progress[1]) + size = int(progress[2]) + + if self.fw_progress_chunk: + self.fw_progress_chunk.seek(0) + progress_chunk = int(self.fw_progress_chunk.readline().strip()) + + if stage == stage_req: + return (min(progress_block + progress_chunk, size), size) + else: + return (0, 0) + + def show_network_fw_progress_bar(self): + p = multiprocessing.Process(target=self.__do_progress_bar, + args=(FW_NETWORK_STAGE, 'Network Firmware Upload')) + p.start() + p.join(0) + + def __do_progress_bar(self, stage_req, title): + with tqdm(unit='bytes', unit_scale=True, unit_divisor=1024, desc=title, leave=True) as t: + last_update = 0 + while True: + current, total = self.get_fw_upload_progress(stage_req) + if total: + t.total = total + t.update(current - last_update) + last_update = current + if current > 0.95 * total: + t.update(total - last_update) + break + time.sleep(0.5) + + def get_roi_scaled(self, request: CompletedRequest, stream="main") -> tuple: + """Get the region of interest (ROI) in output image coordinates space.""" + picam2 = request.picam2 + isp_output_size = self.get_isp_output_size(picam2, stream) + sensor_output_size = self.get_isp_output_size(picam2, 'raw') + scaler_crop = Rectangle(*request.get_metadata()['ScalerCrop']) + obj = self.__get_full_sensor_resolution() + roi = self.__get_obj_scaled(obj, isp_output_size, scaler_crop, sensor_output_size) + return roi.to_tuple() + + @staticmethod + def get_isp_output_size(picam2, stream="main") -> tuple: + return Size(*picam2.camera_configuration()[stream]['size']) + + def __get_obj_scaled(self, obj, isp_output_size, scaler_crop, sensor_output_size) -> Rectangle: + """Scale the object coordinates based on the camera configuration and sensor properties.""" + full_sensor = self.__get_full_sensor_resolution() + width, height = full_sensor.size.to_tuple() + sensor_crop = scaler_crop.scaled_by(sensor_output_size, full_sensor.size) + + # Make sure the object is bound to the user requested ROI. 
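+        # The full mapping: full-sensor coordinates are scaled into the raw sensor
+        # output, bounded to the current ScalerCrop, translated to the crop origin,
+        # and finally scaled into the requested ISP output stream.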
+        if 'roi' in self.config and self.config['roi'] != Rectangle(0, 0, 0, 0):
+            obj = obj.bounded_to(self.config['roi'])
+
+        obj_sensor = obj.scaled_by(sensor_output_size, Size(width, height))
+        obj_bound = obj_sensor.bounded_to(sensor_crop)
+        obj_translated = obj_bound.translated_by(-sensor_crop.topLeft)
+        obj_scaled = obj_translated.scaled_by(isp_output_size, sensor_crop.size)
+        return obj_scaled
+
+    def get_input_size(self) -> tuple:
+        """Get the model input tensor size as (width, height)."""
+        return self.config['input_tensor_size']
+
+    def input_tensor_image(self, input_tensor):
+        """Convert input tensor in planar format to interleaved RGB."""
+        width = self.config['input_tensor']['width']
+        height = self.config['input_tensor']['height']
+        r1 = np.array(input_tensor, dtype=np.uint8).astype(np.int32).reshape((3,) + (height, width))
+        r1 = r1[(2, 1, 0), :, :]
+        norm_val = self.config['input_tensor']['norm_val']
+        norm_shift = self.config['input_tensor']['norm_shift']
+        div_val = self.config['input_tensor']['div_val']
+        div_shift = self.config['input_tensor']['div_shift']
+        for i in [0, 1, 2]:
+            r1[i] = ((((r1[i] << norm_shift[i]) - norm_val[i]) << div_shift) // div_val[i]) & 0xff
+
+        return np.transpose(r1, (1, 2, 0)).astype(np.uint8)
+
+    def get_outputs(self, metadata: dict, add_batch=False) -> Optional[list[np.ndarray]]:
+        """Get the model outputs."""
+        output_tensor = metadata.get('CnnOutputTensor')
+        if not output_tensor:
+            return None
+
+        np_output = np.fromiter(output_tensor, dtype=np.float32)
+        output_shapes = self.get_output_shapes(metadata)
+        offset = 0
+        outputs = []
+        for tensor_shape in output_shapes:
+            size = np.prod(tensor_shape)
+            reshaped_tensor = np_output[offset:offset + size].reshape(tensor_shape, order='F')
+            if add_batch:
+                reshaped_tensor = np.expand_dims(reshaped_tensor, 0)
+            outputs.append(reshaped_tensor)
+            offset += size
+        return outputs
+
+    def get_output_shapes(self, metadata: dict) -> list[tuple[int]]:
+        """Get the model output shapes, or an empty list if the network has no outputs."""
+        output_tensor_info = metadata.get('CnnOutputTensorInfo')
+        if not output_tensor_info:
+            return []
+        output_tensor_info = self.__get_output_tensor_info(output_tensor_info)['info']
+        return [o['size'] for o in output_tensor_info]
+
+    def set_inference_roi_abs(self, roi: tuple):
+        """
+        Set the absolute inference image crop.
+
+        Specify an absolute region of interest in the form of a (left, top, width, height) crop for the input
+        inference image. The coordinates are based on the full sensor resolution.
+        """
+        roi = Rectangle(*roi)
+        roi = roi.bounded_to(self.__get_full_sensor_resolution())
+
+        r = (ctypes.c_uint32 * 4)()
+        r[0] = roi.x
+        r[1] = roi.y
+        r[2] = roi.width
+        r[3] = roi.height
+
+        c = (v4l2_ext_control * 1)()
+        c[0].p_u32 = r
+        c[0].id = ROI_CTRL_ID
+        c[0].size = 16
+
+        ctrl = v4l2_ext_controls()
+        ctrl.count = 1
+        ctrl.controls = c
+
+        try:
+            fcntl.ioctl(self.device_fd, VIDIOC_S_EXT_CTRLS, ctrl)
+            self.__cfg['roi'] = roi
+        except OSError as err:
+            print(f'IMX500: Unable to set ROI control in the device driver: {err}')
+
+    def set_inference_aspect_ratio(self, aspect_ratio: tuple):
+        """
+        Set the aspect ratio of the inference image.
+
+        Specify a pixel aspect ratio needed for the input inference image relative to the full sensor resolution.
+        This simply calculates an ROI based on a centre crop and calls set_inference_roi_abs().
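+
+        For example, set_inference_aspect_ratio((1, 1)) requests the largest centred
+        square crop of the sensor, and set_auto_aspect_ratio() below passes the input
+        tensor size here to obtain a crop matching the network's native aspect ratio.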
+        """
+        f = self.__get_full_sensor_resolution()
+        r = f.size.bounded_to_aspect_ratio(Size(aspect_ratio[0], aspect_ratio[1]))
+        r = r.centered_to(f.center).enclosed_in(f)
+        self.set_inference_roi_abs(r.to_tuple())
+
+    def set_auto_aspect_ratio(self):
+        """Set the inference image crop to preserve the input tensor aspect ratio."""
+        self.set_inference_aspect_ratio(self.config['input_tensor_size'])
+
+    def __get_output_tensor_info(self, tensor_info) -> dict:
+        """Return the network name along with a list of output tensor parameters."""
+        if type(tensor_info) not in [bytes, bytearray]:
+            tensor_info = bytes(tensor_info)
+
+        size = ctypes.sizeof(CnnOutputTensorInfoExported)
+        if len(tensor_info) != size:
+            raise ValueError(f'tensor info length {len(tensor_info)} does not match expected size {size}')
+
+        # Create an instance of the struct and copy data into it
+        parsed = CnnOutputTensorInfoExported()
+        ctypes.memmove(ctypes.addressof(parsed), tensor_info, size)
+
+        result = {
+            'network_name': parsed.network_name.decode('utf-8').strip('\x00'),
+            'num_tensors': parsed.num_tensors,
+            'info': []
+        }
+
+        for t in parsed.info[0:parsed.num_tensors]:
+            info = {
+                'tensor_data_num': t.tensor_data_num,
+                'num_dimensions': t.num_dimensions,
+                'size': list(t.size)[0:t.num_dimensions],
+            }
+            result['info'].append(info)
+
+        return result
+
+    def __get_input_tensor_info(self, tensor_info) -> tuple[str, int, int, int]:
+        """Return the input tensor parameters in the form (network_name, width, height, num_channels)."""
+        tensor_fmt = f'{NETWORK_NAME_LEN}sIII'
+
+        if type(tensor_info) not in [bytes, bytearray]:
+            tensor_info = bytes(tensor_info)
+
+        network_name, width, height, num_channels = struct.unpack(tensor_fmt, tensor_info)
+        network_name = network_name.decode('utf-8').rstrip('\0')
+        return (network_name, width, height, num_channels)
+
+    @staticmethod
+    def get_kpi_info(metadata: dict) -> Optional[tuple[float, float]]:
+        """Return the KPI parameters in the form (dnn_runtime, dsp_runtime)."""
+        kpi_info = metadata.get('CnnKpiInfo')
+        if kpi_info is None:
+            return None
+        if type(kpi_info) not in [bytes, bytearray]:
+            kpi_info = bytes(kpi_info)
+
+        dnn_runtime, dsp_runtime = struct.unpack('II', kpi_info)
+        return dnn_runtime / 1000, dsp_runtime / 1000
+
+    def __set_network_firmware(self, network_filename: str):
+        """Provide a firmware rpk file to upload to the IMX500. This must be called before Picamera2 is configured."""
+        if not os.path.isfile(network_filename):
+            raise RuntimeError(f'Firmware file {network_filename} does not exist.')
+
+        fd = os.open(network_filename, os.O_RDONLY)
+        if fd:
+            ctrl = v4l2_control()
+            ctrl.id = NETWORK_FW_FD_CTRL_ID
+            ctrl.value = fd
+
+            try:
+                fcntl.ioctl(self.device_fd, VIDIOC_S_CTRL, ctrl)
+                print('\n------------------------------------------------------------------------------------------------------------------\n'  # noqa
+                      'NOTE: Loading network firmware onto the IMX500 can take several minutes, please do not close down the application.'
# noqa + '\n------------------------------------------------------------------------------------------------------------------\n', file=sys.stderr) # noqa + except OSError as err: + raise RuntimeError(f'IMX500: Unable to set network firmware {network_filename}: {err}') + finally: + os.close(fd) + + def __ni_from_network(self, network_filename: str): + """Extracts 'network_info.txt' from CPIO-archive appended to the network rpk.""" + with open(network_filename, 'rb') as fp: + fw = memoryview(fp.read()) + + # Iterate through network firmware discarding blocks + cpio_offset = 0 + while True: + # Parse header (+ current block size) + (magic, size) = struct.unpack('>4sI', fw[:8]) + if not magic == b'9464': + break + fw = fw[size + 60:] + # Ensure footer is as expected + (magic,) = struct.unpack('4s', fw[:4]) + if not magic == b'3695': + raise RuntimeError(f'No matching footer found in firmware file {network_filename}') + fw = fw[4:] + cpio_offset += size + 64 + + cpio_fd = os.open(network_filename, os.O_RDONLY) + os.lseek(cpio_fd, cpio_offset, os.SEEK_SET) + + with fd_reader(cpio_fd) as archive: + for entry in archive: + if 'network_info.txt' == str(entry): + self.__cfg['network_info_raw'] = b''.join(entry.get_blocks()) + elif 'network_intrinsics' == str(entry): + self.__cfg['intrinsics'] = NetworkIntrinsics(json.loads(b''.join(entry.get_blocks()))) + + os.close(cpio_fd) + + if 'network_info_raw' not in self.__cfg: + return + + res = {} + buf = io.StringIO(self.__cfg['network_info_raw'].decode('ascii')) + for line in buf: + key, value = line.strip().split('=') + if key == 'networkID': + nid: int = 0 + for idx, x in enumerate(value): + nid |= (ord(x) - ord('0')) << (20 - idx * 4) + res[key] = nid + if key == 'apParamSize': + res[key] = int(value) + if key == 'networkNum': + res[key] = int(value) + + res['network'] = {} + networks = self.__cfg['network_info_raw'].decode('ascii').split('networkOrdinal=')[1:] + for nw in networks: + buf = io.StringIO(nw) + nw_idx = int(buf.readline()) + nw_properties = {} + for line in buf: + key, value = line.strip().split('=') + nw_properties[key] = value + res['network'][nw_idx] = nw_properties + + if len(res['network']) != res['networkNum']: + raise RuntimeError('Insufficient networkNum settings in network_info.txt') + + self.__cfg['network_info'] = res + + # Extract some input tensor config params + self.__cfg['input_tensor']['width'] = int(res['network'][0]['inputTensorWidth']) + self.__cfg['input_tensor']['height'] = int(res['network'][0]['inputTensorHeight']) + self.__cfg['input_tensor_size'] = (self.config['input_tensor']['width'], + self.config['input_tensor']['height']) + + input_format = self.__cfg['network_info']['network'][0]['inputTensorFormat'] + inputTensorNorm_K03 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K03'], 0) + inputTensorNorm_K13 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K13'], 0) + inputTensorNorm_K23 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K23'], 0) + inputTensorNorm_K00 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K00'], 0) + inputTensorNorm_K22 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K22'], 0) + inputTensorNorm_K02 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K02'], 0) + inputTensorNorm_K20 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K20'], 0) + inputTensorNorm_K11 = int(self.__cfg['network_info']['network'][0]['inputTensorNorm_K11'], 0) + + self.__cfg['input_tensor']['input_format'] = 
input_format
+
+        # The inputTensorNorm registers hold two's-complement values (13 bits for
+        # the norm terms, 12 bits for the div terms); sign-extend them here.
+        if input_format == 'RGB' or input_format == 'BGR':
+            norm_val_0 = \
+                inputTensorNorm_K03 if ((inputTensorNorm_K03 >> 12) & 1) == 0 else -((~inputTensorNorm_K03 + 1) & 0x1fff)
+            norm_val_1 = \
+                inputTensorNorm_K13 if ((inputTensorNorm_K13 >> 12) & 1) == 0 else -((~inputTensorNorm_K13 + 1) & 0x1fff)
+            norm_val_2 = \
+                inputTensorNorm_K23 if ((inputTensorNorm_K23 >> 12) & 1) == 0 else -((~inputTensorNorm_K23 + 1) & 0x1fff)
+            norm_val = [norm_val_0, norm_val_1, norm_val_2]
+            self.__cfg['input_tensor']['norm_val'] = norm_val
+            norm_shift = [4, 4, 4]
+            self.__cfg['input_tensor']['norm_shift'] = norm_shift
+            if input_format == 'RGB':
+                div_val_0 = \
+                    inputTensorNorm_K00 if ((inputTensorNorm_K00 >> 11) & 1) == 0 else -((~inputTensorNorm_K00 + 1) & 0x0fff)
+                div_val_2 = \
+                    inputTensorNorm_K22 if ((inputTensorNorm_K22 >> 11) & 1) == 0 else -((~inputTensorNorm_K22 + 1) & 0x0fff)
+            else:
+                div_val_0 = \
+                    inputTensorNorm_K02 if ((inputTensorNorm_K02 >> 11) & 1) == 0 else -((~inputTensorNorm_K02 + 1) & 0x0fff)
+                div_val_2 = \
+                    inputTensorNorm_K20 if ((inputTensorNorm_K20 >> 11) & 1) == 0 else -((~inputTensorNorm_K20 + 1) & 0x0fff)
+            div_val_1 = \
+                inputTensorNorm_K11 if ((inputTensorNorm_K11 >> 11) & 1) == 0 else -((~inputTensorNorm_K11 + 1) & 0x0fff)
+            self.__cfg['input_tensor']['div_val'] = [div_val_0, div_val_1, div_val_2]
+            self.__cfg['input_tensor']['div_shift'] = 6
diff --git a/picamera2/devices/imx500/postprocess.py b/picamera2/devices/imx500/postprocess.py
new file mode 100644
index 00000000..ee1c825e
--- /dev/null
+++ b/picamera2/devices/imx500/postprocess.py
@@ -0,0 +1,414 @@
+"""
+This code is based on multiple sources:
+
+https://github.com/rbgirshick/fast-rcnn
+https://github.com/ultralytics/ultralytics
+https://github.com/see--/keras-centernet
+https://github.com/stefanopini/simple-HigherHRNet
+"""
+
+from enum import Enum
+from typing import List
+
+import cv2
+import numpy as np
+
+from picamera2 import Picamera2
+
+
+def nms(dets: np.ndarray, scores: np.ndarray, iou_thres: float = 0.55, max_out_dets: int = 50) -> List[int]:
+    """
+    Perform Non-Maximum Suppression (NMS) on detected bounding boxes.
+
+    Args:
+        dets (np.ndarray): Array of bounding box coordinates of shape (N, 4) representing [y1, x1, y2, x2].
+        scores (np.ndarray): Array of confidence scores associated with each bounding box.
+        iou_thres (float, optional): IoU threshold for NMS. Default is 0.55.
+        max_out_dets (int, optional): Maximum number of output detections to keep. Default is 50.
+
+    Returns:
+        List[int]: Indices of the bounding boxes to keep after NMS.
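+
+    Example (illustrative values):
+        dets = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]])
+        keep = nms(dets, np.array([0.9, 0.8, 0.7]), iou_thres=0.5)
+        # keep -> [0, 2]: box 1 overlaps box 0 with IoU ~= 0.70 and is suppressed.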
+
+    """
+    y1, x1 = dets[:, 0], dets[:, 1]
+    y2, x2 = dets[:, 2], dets[:, 3]
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= iou_thres)[0]
+        order = order[inds + 1]
+
+    return keep[:max_out_dets]
+
+
+def combined_nms(batch_boxes, batch_scores, iou_thres: float = 0.65, conf: float = 0.55, max_out_dets: int = 50):
+    nms_results = []
+    for boxes, scores in zip(batch_boxes, batch_scores):
+        xc = np.argmax(scores, 1)
+        xs = np.amax(scores, 1)
+        x = np.concatenate([boxes, np.expand_dims(xs, 1), np.expand_dims(xc, 1)], 1)
+
+        xi = xs > conf
+        x = x[xi]
+
+        x = x[np.argsort(-x[:, 4])[:8400]]
+        scores = x[:, 4]
+        x[..., :4] = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H)
+        offset = x[:, 5] * 640
+        boxes = x[..., :4] + np.expand_dims(offset, 1)
+
+        # Original post-processing part
+        valid_indexs = nms(boxes, scores, iou_thres=iou_thres, max_out_dets=max_out_dets)
+        x = x[valid_indexs]
+        nms_classes = x[:, 5]
+        nms_bbox = x[:, :4]
+        nms_scores = x[:, 4]
+
+        nms_results.append((nms_bbox, nms_scores, nms_classes))
+
+    return nms_results
+
+
+def combined_nms_seg(batch_boxes, batch_scores, batch_masks, iou_thres: float = 0.5, conf: float = 0.001,
+                     max_out_dets: int = 300):
+    nms_results = []
+    for boxes, scores, masks in zip(batch_boxes, batch_scores, batch_masks):
+        # Compute maximum scores and corresponding class indices
+        class_indices = np.argmax(scores, axis=1)
+        max_scores = np.amax(scores, axis=1)
+        detections = np.concatenate([boxes, np.expand_dims(max_scores, axis=1), np.expand_dims(class_indices, axis=1)],
+                                    axis=1)
+
+        # Swap the position of the two dimensions (32, 8400) to (8400, 32)
+        masks = np.transpose(masks, (1, 0))
+        # Filter out detections below the confidence threshold
+        valid_detections = max_scores > conf
+
+        # Nothing cleared the confidence threshold
+        if not np.any(valid_detections):
+            nms_results.append((np.ndarray(0), np.ndarray(0), np.ndarray(0), np.ndarray(0)))
+        else:
+            detections = detections[valid_detections]
+            masks = masks[valid_detections]
+
+            # Sort detections by score in descending order
+            sorted_indices = np.argsort(-detections[:, 4])
+            detections = detections[sorted_indices]
+            masks = masks[sorted_indices]
+
+            detections[..., :4] = convert_to_ymin_xmin_ymax_xmax_format(detections[..., :4], BoxFormat.XC_YC_W_H)
+
+            # Perform class-wise NMS
+            unique_classes = np.unique(detections[:, 5])
+            final_indices = []
+
+            for cls in unique_classes:
+                cls_indices = np.where(detections[:, 5] == cls)[0]
+                cls_boxes = detections[cls_indices, :4]
+                cls_scores = detections[cls_indices, 4]
+                cls_valid_indices = nms(cls_boxes, cls_scores, iou_thres=iou_thres, max_out_dets=max_out_dets)
+                final_indices.extend(cls_indices[cls_valid_indices])
+
+            final_indices = np.array(final_indices)
+            final_detections = detections[final_indices]
+            final_masks = masks[final_indices]
+
+            # Extract class indices, bounding boxes, and scores
+            nms_classes = final_detections[:, 5]
+            nms_bbox = final_detections[:, :4]
+            nms_scores = final_detections[:, 4]
+
+            # Append results including masks
+            nms_results.append((nms_bbox, nms_scores, nms_classes, final_masks))
+    return nms_results
+
+
+class 
BoxFormat(Enum):
+    YMIM_XMIN_YMAX_XMAX = 'ymin_xmin_ymax_xmax'
+    XMIM_YMIN_XMAX_YMAX = 'xmin_ymin_xmax_ymax'
+    XMIN_YMIN_W_H = 'xmin_ymin_width_height'
+    XC_YC_W_H = 'xc_yc_width_height'
+
+
+def convert_to_ymin_xmin_ymax_xmax_format(boxes, orig_format: BoxFormat):
+    """
+    Change boxes from one format to another (e.g. XMIN_YMIN_W_H --> YMIM_XMIN_YMAX_XMAX).
+
+    Boxes already in the target format are returned unchanged.
+
+    :param boxes: array of boxes, shape (N, 4)
+    :param orig_format: the BoxFormat the boxes are currently in
+    :return: boxes in YMIM_XMIN_YMAX_XMAX format
+    """
+    if len(boxes) == 0:
+        return boxes
+    elif orig_format == BoxFormat.YMIM_XMIN_YMAX_XMAX:
+        return boxes
+    elif orig_format == BoxFormat.XMIN_YMIN_W_H:
+        boxes[:, 2] += boxes[:, 0]  # convert width to xmax
+        boxes[:, 3] += boxes[:, 1]  # convert height to ymax
+        boxes[:, 0], boxes[:, 1] = boxes[:, 1], boxes[:, 0].copy()  # swap xmin, ymin columns
+        boxes[:, 2], boxes[:, 3] = boxes[:, 3], boxes[:, 2].copy()  # swap xmax, ymax columns
+        return boxes
+    elif orig_format == BoxFormat.XMIM_YMIN_XMAX_YMAX:
+        boxes[:, 0], boxes[:, 1] = boxes[:, 1], boxes[:, 0].copy()  # swap xmin, ymin columns
+        boxes[:, 2], boxes[:, 3] = boxes[:, 3], boxes[:, 2].copy()  # swap xmax, ymax columns
+        return boxes
+    elif orig_format == BoxFormat.XC_YC_W_H:
+        new_boxes = np.copy(boxes)
+        new_boxes[:, 0] = boxes[:, 1] - boxes[:, 3] / 2  # top left y
+        new_boxes[:, 1] = boxes[:, 0] - boxes[:, 2] / 2  # top left x
+        new_boxes[:, 2] = boxes[:, 1] + boxes[:, 3] / 2  # bottom right y
+        new_boxes[:, 3] = boxes[:, 0] + boxes[:, 2] / 2  # bottom right x
+        return new_boxes
+    else:
+        raise Exception("Unsupported boxes format")
+
+
+def clip_boxes(boxes: np.ndarray, h: int, w: int) -> np.ndarray:
+    """
+    Clip bounding boxes to stay within the image boundaries.
+
+    Args:
+        boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max].
+        h (int): Height of the image.
+        w (int): Width of the image.
+
+    Returns:
+        numpy.ndarray: Clipped bounding boxes.
+    """
+    boxes[..., 0] = np.clip(boxes[..., 0], a_min=0, a_max=h)
+    boxes[..., 1] = np.clip(boxes[..., 1], a_min=0, a_max=w)
+    boxes[..., 2] = np.clip(boxes[..., 2], a_min=0, a_max=h)
+    boxes[..., 3] = np.clip(boxes[..., 3], a_min=0, a_max=w)
+    return boxes
+
+
+def scale_boxes(boxes: np.ndarray, h_image: int, w_image: int, h_model: int, w_model: int, preserve_aspect_ratio: bool,
+                normalized: bool = True) -> np.ndarray:
+    """
+    Scale and offset bounding boxes based on model output size and original image size.
+
+    Args:
+        boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max].
+        h_image (int): Original image height.
+        w_image (int): Original image width.
+        h_model (int): Model output height.
+        w_model (int): Model output width.
+        preserve_aspect_ratio (bool): Whether to preserve image aspect ratio during scaling.
+        normalized (bool): Whether the input boxes are in normalised [0, 1] coordinates. Default is True.
+
+    Returns:
+        numpy.ndarray: Scaled and offset bounding boxes.
+    """
+    deltaH, deltaW = 0, 0
+    H, W = h_model, w_model
+    scale_H, scale_W = h_image / H, w_image / W
+
+    if preserve_aspect_ratio:
+        scale_H = scale_W = max(h_image / H, w_image / W)
+        H_tag = int(np.round(h_image / scale_H))
+        W_tag = int(np.round(w_image / scale_W))
+        deltaH, deltaW = int((H - H_tag) / 2), int((W - W_tag) / 2)
+
+    nh, nw = (H, W) if normalized else (1, 1)
+
+    # Scale and offset boxes
+    # [y_min, x_min, y_max, x_max].
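+    # When preserve_aspect_ratio is set the model input was letterboxed, so
+    # deltaH/deltaW are the pad offsets in model space and are subtracted before
+    # rescaling; nh/nw undo [0, 1] normalisation when the network emits
+    # normalised coordinates. Out-of-range results are clipped below.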
+    boxes[..., 0] = (boxes[..., 0] * nw - deltaW) * scale_W
+    boxes[..., 1] = (boxes[..., 1] * nh - deltaH) * scale_H
+    boxes[..., 2] = (boxes[..., 2] * nw - deltaW) * scale_W
+    boxes[..., 3] = (boxes[..., 3] * nh - deltaH) * scale_H
+
+    # Clip boxes
+    boxes = clip_boxes(boxes, h_image, w_image)
+
+    return boxes
+
+
+def scale_coords(kpts: np.ndarray, h_image: int, w_image: int, h_model: int, w_model: int,
+                 preserve_aspect_ratio: bool) -> np.ndarray:
+    """
+    Scale and offset keypoints based on model output size and original image size.
+
+    Args:
+        kpts (numpy.ndarray): Array of keypoints of shape [..., 17, 3] where the last dim is (x, y, visible).
+        h_image (int): Original image height.
+        w_image (int): Original image width.
+        h_model (int): Model output height.
+        w_model (int): Model output width.
+        preserve_aspect_ratio (bool): Whether to preserve image aspect ratio during scaling.
+
+    Returns:
+        numpy.ndarray: Scaled and offset keypoints.
+    """
+    deltaH, deltaW = 0, 0
+    H, W = h_model, w_model
+    scale_H, scale_W = h_image / H, w_image / W
+
+    if preserve_aspect_ratio:
+        scale_H = scale_W = max(h_image / H, w_image / W)
+        H_tag = int(np.round(h_image / scale_H))
+        W_tag = int(np.round(w_image / scale_W))
+        deltaH, deltaW = int((H - H_tag) / 2), int((W - W_tag) / 2)
+
+    # Scale and offset keypoints
+    kpts[..., 0] = (kpts[..., 0] - deltaH) * scale_H
+    kpts[..., 1] = (kpts[..., 1] - deltaW) * scale_W
+
+    # Clip keypoints
+    kpts = clip_coords(kpts, h_image, w_image)
+
+    return kpts
+
+
+def clip_coords(kpts: np.ndarray, h: int, w: int) -> np.ndarray:
+    """
+    Clip keypoints to stay within the image boundaries.
+
+    Args:
+        kpts (numpy.ndarray): Array of keypoints of shape [..., 17, 3] where the last dim is (x, y, visible).
+        h (int): Height of the image.
+        w (int): Width of the image.
+
+    Returns:
+        numpy.ndarray: Clipped keypoints.
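+
+    For example, with h = w = 160 a coordinate of -3 is snapped to 0 and a
+    coordinate of 200 is snapped to 160; the clipping is done in place on kpts.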
+ """ + kpts[..., 0] = np.clip(kpts[..., 0], a_min=0, a_max=h) + kpts[..., 1] = np.clip(kpts[..., 1], a_min=0, a_max=w) + return kpts + + +PARTS = { + 0: 'Nose', + 1: 'EyeL', + 2: 'EyeR', + 3: 'EarL', + 4: 'EarR', + 5: 'SholderL', + 6: 'SholderR', + 7: 'ElbowL', + 8: 'ElbowR', + 9: 'WristL', + 10: 'WristR', + 11: 'HipL', + 12: 'HipR', + 13: 'KneeL', + 14: 'KneeR', + 15: 'AnkleL', + 16: 'AnkleR' +} + + +class COCODrawer: + def __init__(self, categories, imx500, needs_rescale_coords=True): + self.categories = categories + self.imx500 = imx500 + self.needs_rescale_coords = needs_rescale_coords + + def get_coords(self, annotation, metadata: dict, picam2: Picamera2, stream): + if self.needs_rescale_coords: + obj_scaled = self.imx500.convert_inference_coords(annotation, metadata, picam2, stream) + x0 = obj_scaled.x + y0 = obj_scaled.y + x1 = x0 + obj_scaled.width + y1 = y0 + obj_scaled.height + else: + y0, x0, y1, x1 = annotation + y0 = max(0, y0) + x0 = max(0, x0) + return int(y0), int(x0), int(y1), int(x1) + + def draw_bounding_box(self, img, annotation, class_id, score, metadata: dict, picam2: Picamera2, stream): + y0, x0, y1, x1 = self.get_coords(annotation, metadata, picam2, stream) + text = f"{self.categories[int(class_id)]}:{score:.3f}" + cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 2) + cv2.putText(img, text, (x0, y0), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) + + def draw_keypoints(self, img, keypoints, min_confidence, metadata: dict, picam2: Picamera2, stream): + def get_point(index): + y0, x0 = keypoints[index][1], keypoints[index][0] + y0, x0, _, _ = self.get_coords((y0, x0, y0 + 1, x0 + 1), metadata, picam2, stream) + return x0, y0 + + skeleton = [ + [0, 1], [0, 2], [1, 3], [2, 4], # Head + [5, 6], [5, 7], [7, 9], [6, 8], # Arms + [8, 10], [5, 11], [6, 12], [11, 12], # Body + [11, 13], [12, 14], [13, 15], [14, 16] # Legs + ] + + # Draw skeleton lines + for connection in skeleton: + start_point = get_point(connection[0]) + end_point = get_point(connection[1]) + start_confidence = keypoints[connection[0]][2] + end_confidence = keypoints[connection[1]][2] + if start_confidence < min_confidence or end_confidence < min_confidence: + continue + cv2.line(img, start_point, end_point, (255, 0, 0), 2) + + # Draw keypoints as colored circles + for i in range(len(keypoints)): + x, y = get_point(i) + confidence = keypoints[i][2] + if confidence < min_confidence: + continue + cv2.circle(img, (x, y), 3, (0, 255, 0), -1) + label = f"{PARTS.get(i)}.{confidence:.3f}" + cv2.putText(img, label, (x + 5, y + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (0, 255, 0), 1) + + def annotate_image(self, img, b, s, c, k, box_min_conf, kps_min_conf, metadata: dict, picam2: Picamera2, stream): + for index, row in enumerate(b): + if s[index] >= box_min_conf: + self.draw_bounding_box(img, row, c[index], s[index], metadata, picam2, stream) + if k is not None: + self.draw_keypoints(img, k[index], kps_min_conf, metadata, picam2, stream) + + def overlay_masks(self, picam2, masks, scores, colors, score_threshold=0.55, mask_threshold=0.5): + overlay = np.zeros((masks.shape[1], masks.shape[2], 4), dtype=np.uint8) + for idx, (mask, score) in enumerate(zip(masks, scores)): + if score > score_threshold: # Check if the score is above the threshold + binary_mask = (mask > mask_threshold).astype(np.uint8) + color = np.array(colors[idx][:3]) * 255 # Convert color to 0-255 scale + overlay[binary_mask == 1, :3] = color + overlay[binary_mask == 1, 3] = 127 # opacity + picam2.set_overlay(overlay) + + +def softmax(x): + y = 
np.exp(x - np.expand_dims(np.max(x, axis=-1), axis=-1)) + z = y / np.expand_dims(np.sum(y, axis=-1), axis=-1) + return z + + +def crop_mask(masks, boxes): + """ + It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box + + Args: + masks (numpy.ndarray): [h, w, n] tensor of masks + boxes (numpy.ndarray): [n, 4] tensor of bbox coordinates in relative point form + + Returns: + (numpy.ndarray): The masks are being cropped to the bounding box. + """ + n, w, h = masks.shape + x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1) + c = np.arange(h, dtype=np.float32)[None, None, :] # rows shape(1,w,1) + r = np.arange(w, dtype=np.float32)[None, :, None] # cols shape(h,1,1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) diff --git a/picamera2/devices/imx500/postprocess_efficientdet_lite0.py b/picamera2/devices/imx500/postprocess_efficientdet_lite0.py new file mode 100644 index 00000000..3ffd7fd4 --- /dev/null +++ b/picamera2/devices/imx500/postprocess_efficientdet_lite0.py @@ -0,0 +1,213 @@ +""" +Efficientdet postprocessing + +This code is based on: +https://github.com/google/automl/tree/master/efficientdet +""" + +from typing import Tuple + +import numpy as np + +from picamera2.devices.imx500.postprocess import ( + BoxFormat, convert_to_ymin_xmin_ymax_xmax_format, nms) +from picamera2.devices.imx500.postprocess_yolov5 import coco80_to_coco91 + +default_box_variance = [1.0, 1.0, 1.0, 1.0] +default_aspect_ratios = [1.0, 2.0, 0.5] + + +def postprocess_efficientdet_lite0_detection(outputs: Tuple[np.ndarray, np.ndarray, np.ndarray], + anchor_scale=3, + min_level=3, + max_level=7, + box_variance=default_box_variance, + model_input_shape=(320, 320), + min_wh=2, + max_wh=7680, + conf_thres: float = 0.001, + iou_thres: float = 0.65, + max_nms_dets: int = 5000, + max_out_dets: int = 1000): + H, W = model_input_shape + ############################################################ + # Box decoding + ############################################################ + outputs_decoded = box_decoding_edetlite(output_annotations=outputs, + H=H, + W=W, + anchor_scale=anchor_scale, + min_level=min_level, + max_level=max_level, + box_variance=box_variance) + + classes = outputs[0] + num_categories = classes.shape[-1] + + ############################################################ + # Post processing for each input image + ############################################################ + # Note: outputs_decoded shape is [Batch,num_anchors*Detections,(4+1+num_categories)] + post_processed_outputs = [] + for _, x in enumerate(outputs_decoded): + # ---------------------------------------- + # Filter by score and width-height + # ---------------------------------------- + scores = x[..., 4] + wh = x[..., 2:4] + valid_indexs = (scores > conf_thres) & ((wh > min_wh).any(1)) & ((wh < max_wh).any(1)) + x = x[valid_indexs] + + # ---------------------------------------- + # Taking Best class only + # ---------------------------------------- + x[..., 5:] *= x[..., 4:5] # compute confidence per class (class_score * object_score) + conf = np.max(x[:, 5:], axis=1, keepdims=True) + classes_id = np.argmax(x[:, 5:], axis=1, keepdims=True) + + # Change boxes format from [x_c,y_c,w,h] to [y_min,x_min,y_max,x_max] + boxes = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + x = np.concatenate((boxes, conf, classes_id), axis=1)[conf.reshape(-1) > conf_thres] + + # --------------------------- # + # NMS + # --------------------------- # + x = x[np.argsort(-x[:, 
4])[:max_nms_dets]] # sort by confidence from high to low + offset = x[..., 5:6] * np.maximum(H, W) + boxes_offset, scores = x[..., :4] + offset, x[..., 4] # boxes with offset by class + valid_indexs = nms(dets=boxes_offset, scores=scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + + boxes = x[..., :4] + + # --------------------------- # + # Classes process + # --------------------------- # + # convert classes from coco80 to coco91 to match labels + classes = coco80_to_coco91(x[..., 5]) if num_categories == 80 else x[..., 5] + classes -= 0 + + # --------------------------- # + # Scores + # --------------------------- # + scores = x[..., 4] + + # Add result + post_processed_outputs.append({'boxes': boxes, 'classes': classes, 'scores': scores}) + + return post_processed_outputs[0]['boxes'], post_processed_outputs[0]['scores'], post_processed_outputs[0]['classes'] + + +def box_decoding_edetlite(output_annotations, + H=320, + W=320, + anchor_scale=3, + min_level=3, + max_level=7, + box_variance=default_box_variance): + # ----------------------------------------------- + # EfficientDetLite detection post processing + # ----------------------------------------------- + # Note: 'output_annotations' is expected to be a list of 2 feature maps with shapes: + # [0] : [Batch,Detections,num_categories] + # [1] : [Batch,Detections,4] + classes = output_annotations[0] + boxes = output_annotations[1] + classes = 1 / (1 + np.exp(-classes)) # sigmoid + scores = np.ones((*boxes.shape[:-1], 1)) # Add default object scores of 1.0 + + # Combine tensors + outputs = np.concatenate((boxes, scores, classes), axis=2) + + # Box decoding + # Anchor boxes format: [y_min, x_min, y_max, x_max] normalized + + # Extract feature map sizes + strides = [2 ** i for i in range(max_level + 1)] + featmap_sizes = [(np.ceil(H / stride), np.ceil(W / stride)) for stride in strides] + + # Generate priors + batch_size = outputs.shape[0] + anchors = generate_anchors_EDETLITE(batch_size=batch_size, + featmap_sizes=featmap_sizes, + H=H, + W=W, + anchor_scale=anchor_scale, + min_level=min_level, + max_level=max_level) + + # Decode bboxes + y_c_anchors = (anchors[..., 0:1] + anchors[..., 2:3]) / 2 + x_c_anchors = (anchors[..., 1:2] + anchors[..., 3:4]) / 2 + ha = anchors[..., 2:3] - anchors[..., 0:1] + wa = anchors[..., 3:4] - anchors[..., 1:2] + + # Output Box format: [x_c, y_c, w, h] + pred_boxes = outputs[..., :4] + y_c = pred_boxes[..., 0:1] * box_variance[0] * ha + y_c_anchors + x_c = pred_boxes[..., 1:2] * box_variance[1] * wa + x_c_anchors + h = np.exp(pred_boxes[..., 2:3] * box_variance[2]) * ha + w = np.exp(pred_boxes[..., 3:4] * box_variance[3]) * wa + outputs[..., 0:1] = x_c + outputs[..., 1:2] = y_c + outputs[..., 2:3] = w + outputs[..., 3:4] = h + return outputs + + +def generate_anchors_EDETLITE(batch_size, + featmap_sizes, + H=320, + W=320, + anchor_scale=3, + min_level=3, + max_level=7, + aspect_ratios=default_aspect_ratios): + """Generate configurations of anchor boxes.""" + anchor_scales = [anchor_scale] * (max_level - min_level + 1) + num_scales = len(aspect_ratios) + anchor_configs = {} + for level in range(min_level, max_level + 1): + anchor_configs[level] = [] + for scale_octave in range(num_scales): + for aspect in aspect_ratios: + anchor_configs[level].append( + ((featmap_sizes[0][0] / float(featmap_sizes[level][0]), + featmap_sizes[0][1] / float(featmap_sizes[level][1])), + scale_octave / float(num_scales), aspect, + anchor_scales[level - min_level])) + + """Generates multiscale anchor 
boxes.""" + boxes_all = [] + for _, configs in anchor_configs.items(): + boxes_level = [] + for config in configs: + stride, octave_scale, aspect, anchor_scale = config + base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale + base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale + if isinstance(aspect, list): + aspect_x, aspect_y = aspect + else: + aspect_x = np.sqrt(aspect) + aspect_y = 1.0 / aspect_x + anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0 + anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0 + + x = np.arange(stride[1] / 2, W, stride[1]) + y = np.arange(stride[0] / 2, H, stride[0]) + xv, yv = np.meshgrid(x, y) + xv = xv.reshape(-1) + yv = yv.reshape(-1) + + boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2, + yv + anchor_size_y_2, xv + anchor_size_x_2)) + boxes = np.swapaxes(boxes, 0, 1) + boxes_level.append(np.expand_dims(boxes, axis=1)) + + # concat anchors on the same level to the shape Batch x Detections x 4 + boxes_level = np.concatenate(boxes_level, axis=1).reshape([1, -1, 4]) + boxes_level = np.repeat(boxes_level, batch_size, axis=0) + boxes_all.append(boxes_level) + + anchor_boxes = np.concatenate(boxes_all, axis=1) + return anchor_boxes diff --git a/picamera2/devices/imx500/postprocess_highernet.py b/picamera2/devices/imx500/postprocess_highernet.py new file mode 100644 index 00000000..6cd97539 --- /dev/null +++ b/picamera2/devices/imx500/postprocess_highernet.py @@ -0,0 +1,562 @@ +""" +Highernet postprocessing + +This code is based on multiple sources: +https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +https://github.com/princeton-vl/pose-ae-train +https://github.com/yinguobing/facial-landmark-detection-hrnet +""" + +from typing import Tuple + +import cv2 +import numpy as np + +try: + from munkres import Munkres +except ImportError: + raise ImportError("Please install munkres first. 
`pip3 install --break-system-packages munkres`") + +default_joint_order = [0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16] + + +def postprocess_higherhrnet(outputs: list[np.ndarray, np.ndarray], + img_size, + img_w_pad, + img_h_pad, + network_postprocess, + num_joints=17, + tag_per_joint=True, + joint_order=default_joint_order, + detection_threshold=0.3, + max_num_people=30, + nms_kernel=5, + nms_padding=2, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0, + adjust=False, + refine=False, + input_image_size=(288, 384), + output_shape=(144, 192)) -> Tuple[list[list], list, list[list]]: + all_preds = [] + all_scores = [] + if network_postprocess: + # outputs [[B, max_num_people, num_joints], [B, max_num_people, num_joints], [B, max_num_people, num_joints]] + grouped, scores = parse(network_outputs=[outputs[0][0, ...], + outputs[1][0, ...], + outputs[2][0, ...]], + output_shape=output_shape, + adjust=adjust, + refine=refine, + network_postprocess=network_postprocess, + tag_per_joint=tag_per_joint, + max_num_people=max_num_people, + nms_kernel=nms_kernel, + nms_padding=nms_padding, + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + else: + out0 = outputs[0][0] + out1 = outputs[1][0] + + # postprocess: + # resize first output to 2nd output size + out0 = ResizeBilinear(out0, out1.shape[0], out1.shape[1]) + # average heatmaps from both outputs + heatmaps = (out0[..., :17] + out1) / 2 + tags = out0[..., 17:] + grouped, scores = parse(network_outputs=[heatmaps, tags], + output_shape=output_shape, + adjust=adjust, + refine=refine, + network_postprocess=network_postprocess, + tag_per_joint=tag_per_joint, + max_num_people=max_num_people, + nms_kernel=nms_kernel, + nms_padding=nms_padding, + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + + # scale keypoints coordinates to input image size + scale_factor = (np.array(input_image_size) / output_shape).reshape((1, 1, 2)) + for img_index in range(len(grouped)): + if grouped[img_index].shape[0] > 0: + # rescale to preprocessed input image size + grouped[img_index][:, :, :2] = grouped[img_index][:, :, :2] * scale_factor + # remove pad offset: + grouped[img_index][:, :, 0] = grouped[img_index][:, :, 0] - img_w_pad[0] + grouped[img_index][:, :, 1] = grouped[img_index][:, :, 1] - img_h_pad[0] + # rescale to original image size + resized_input_image = np.array(input_image_size) - np.array( + (sum(img_h_pad), + sum(img_w_pad))) + s = (np.array(img_size) / resized_input_image).reshape((1, 1, 2)) + grouped[img_index][:, :, :2] = grouped[img_index][:, :, :2] * s + + # Calculate zero keypoint + zero_kpt = np.zeros((1, 4)) + resized_input_image = np.array(input_image_size) - np.array( + (sum(img_h_pad), + sum(img_w_pad))) + s = (np.array(img_size) / resized_input_image).reshape((1, 1, 2)) + zero_kpt[:, 0] = zero_kpt[:, 0] - img_w_pad[0] + zero_kpt[:, 1] = zero_kpt[:, 1] - img_h_pad[0] + zero_kpt[:, :2] = zero_kpt[:, :2] * s + + all_preds.append(grouped) + all_scores.append(scores) + + kpts = [] + # one image, one iter + for idx, _kpts in enumerate(all_preds): + for idx_kpt, kpt in enumerate(_kpts[0]): + area = (np.max(kpt[:, 0]) - np.min(kpt[:, 0])) * (np.max(kpt[:, 1]) - np.min(kpt[:, 1])) + # kpt [17, 4] + kpt = processKeypoints(kpt) + 
kpts.append( + { + 'keypoints': kpt[:, 0:3], + 'score': all_scores[idx][idx_kpt], + 'tags': kpt[:, 3], + 'area': area + } + ) + # _coco_keypoint_results_one_category_kernel + out_keypoints = [] + out_scores = [] + out_bbox = [] + + # for img_kpts in kpts: + img_kpts = kpts + if len(img_kpts) == 0: + return [], [], [] + + _key_points = np.array( + [img_kpts[k]['keypoints'] for k in range(len(img_kpts))] + ) + key_points = np.zeros( + (_key_points.shape[0], num_joints * 3), + dtype=np.float32 + ) + + for ipt in range(num_joints): + key_points[:, ipt * 3 + 0] = _key_points[:, ipt, 0] + key_points[:, ipt * 3 + 1] = _key_points[:, ipt, 1] + key_points[:, ipt * 3 + 2] = _key_points[:, ipt, 2] # keypoints score. + + for k in range(len(img_kpts)): + kpt = key_points[k].reshape((num_joints, 3)) + # ignore zero kpts + mask = np.isin(kpt, zero_kpt) + kpt = np.where(mask, np.nan, kpt) + left_top = np.nanmin(kpt, axis=0) + right_bottom = np.nanmax(kpt, axis=0) + + out_keypoints.append(list(key_points[k])) + out_scores.append(img_kpts[k]['score']) + out_bbox.append([left_top[1], left_top[0], right_bottom[1], right_bottom[0]]) + return out_keypoints, out_scores, out_bbox + + +def parse(network_outputs, + output_shape, + adjust=False, + refine=False, + network_postprocess=False, + tag_per_joint=17, + max_num_people=30, + nms_kernel=5, + nms_padding=2, + num_joints=17, + joint_order=default_joint_order, + detection_threshold=0.1, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0 + ): + if network_postprocess: + tag_k, ind_k, val_k = network_outputs + x = ind_k % output_shape[1] + y = (ind_k / output_shape[1]).astype(ind_k.dtype) + ind_k = np.stack([x, y], axis=2) + + topk_output_dict = {'tag_k': tag_k[np.newaxis, ...], + 'loc_k': ind_k[np.newaxis, ...], + 'val_k': val_k[np.newaxis, ...], + } + else: + det, tag = network_outputs + # topk_output_dict + # {'tag_k': [num_images, max_num_people, num_joints], + # 'loc_k': [num_images, max_num_people, num_joints, 2], + # 'val_k': [num_images, max_num_people, num_joints]} + topk_output_dict = top_k(det=det, + tag=tag, + tag_per_joint=tag_per_joint, + max_num_people=max_num_people, + nms_kernel=nms_kernel, + nms_padding=nms_padding) + # ans [num_joints_detected, num_joints, 4] + ans = match(tag_k=topk_output_dict['tag_k'], + loc_k=topk_output_dict['loc_k'], + val_k=topk_output_dict['val_k'], + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + max_num_people=max_num_people, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + if adjust: + # ans [[num_joints_detected, num_joints, 4]] + ans = adjust_func(ans, det[np.newaxis, ...]) # TODO support batch size > 1 + + scores = [i[:, 2].mean() for i in ans[0]] + + if refine: + ans = ans[0] + # for every detected person + for _ in range(len(ans)): + # NotImplemented + if not tag_per_joint: + raise NotImplementedError + + # ans [[num_joints_detected, num_joints, 4]] + ans = [ans] + return ans, scores + + +def ResizeBilinear(img, new_height, new_width): + return cv2.resize(img, (new_width, new_height)) + + +def top_k(det, + tag, + tag_per_joint=17, + max_num_people=30, + nms_kernel=5, + nms_padding=2): + # det [144, 192, 17] + # tag [144, 192, 17] + + # det [144, 192, 17] + det = nms(det, + nms_kernel=nms_kernel, + nms_padding=nms_padding) + # num_images 1 + # h 144 + # w 192 + # num_joints 17 + num_images, h, w, num_joints = (1,) + det.shape # TODO: support multiple images (batch>1) + + # det [num_images, 
h*w, num_joints] + det = det.reshape((num_images, -1, num_joints)) + # val_k [num_images, max_num_people, num_joints] + val_k, ind = np_topk(det, max_num_people) + + # tag [num_images, h*w, num_joints] + tag = tag.reshape((num_images, -1, num_joints)) + + # NotImplemented + if not tag_per_joint: + raise NotImplementedError + tag = tag.expand(-1, num_joints, -1, -1) + + # tag_k [num_images, max_num_people, num_joints] + tag_k = np.zeros((num_images, max_num_people, num_joints)) + for img in range(num_images): + for kp in range(num_joints): + tag_k[img, :, kp] = tag[img, ind[img, :, kp], kp] + + x = ind % w + y = (ind / w).astype(ind.dtype) + + # ind_k [num_images, max_num_people, num_joints, 2] + ind_k = np.stack([x, y], axis=3) + + # {'tag_k': [num_images, max_num_people, num_joints], + # 'loc_k': [num_images, max_num_people, num_joints, 2], + # 'val_k': [num_images, max_num_people, num_joints]} + return {'tag_k': tag_k, + 'loc_k': ind_k, + 'val_k': val_k, + } + + +def nms(det, + nms_kernel=5, + nms_padding=2): + # det [144, 192, 17] + # maxm [144, 192, 17] + maxm = np_max_pool(det, k=nms_kernel, p=nms_padding) + maxm = np.equal(maxm, det).astype(np.float32) + det = det * maxm + return det + + +def np_max_pool(x, + k=5, + p=2, + p_value=0): + # x [144, 192, 17] + # k - kernel size (h, w) + # p - padding size (top, bottom, left, right) + if isinstance(k, int): + k = (k, k) + if isinstance(p, int): + p = ((p, p), (p, p), (0, 0)) + elif isinstance(p, (list, tuple)) and len(p) == 2: + p = ((p[0], p[0]), (p[1], p[1]), (0, 0)) + + # y [148, 196, 17 + y = np.pad(x, p) + out = np.concatenate( + [np.max(np.concatenate([y[ky:ky + y.shape[0] - k[0] + 1, kx:kx + y.shape[1] - k[1] + 1, c:c + 1] + for ky in range(k[0]) + for kx in range(k[1])], 2), axis=2, keepdims=True) for c in range(y.shape[2])], 2) + # out [144, 192, 17] + return out + + +def np_topk(x, k): + # x [1, 27648, 17] + # n_images 1 + # n_keypoints 17 + n_images, _, n_keypoints = x.shape + # vals [1, k, 17] + # inds [1, k, 17] + vals = np.zeros((n_images, k, n_keypoints), dtype=x.dtype) + inds = np.zeros((n_images, k, n_keypoints), dtype=np.int64) + for img in range(n_images): + for kp in range(n_keypoints): + # _inds [k] + _inds = np.argpartition(x[img, :, kp], -k)[-k:] + _inds = _inds[np.argsort(x[img, _inds, kp], )][::-1] + inds[img, :, kp] = _inds + vals[img, :, kp] = x[img, _inds, kp] + return vals, inds + + +def match(tag_k, + loc_k, + val_k, + num_joints=17, + joint_order=default_joint_order, + detection_threshold=0.1, + max_num_people=30, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0): + def m(x): + return match_by_tag(inp=x, + num_joints=num_joints, + joint_order=joint_order, + detection_threshold=detection_threshold, + max_num_people=max_num_people, + ignore_too_much=ignore_too_much, + use_detection_val=use_detection_val, + tag_threshold=tag_threshold) + return list(map(m, zip(tag_k, loc_k, val_k))) + + +def match_by_tag(inp, + num_joints=17, + joint_order=default_joint_order, + detection_threshold=0.1, + max_num_people=30, + ignore_too_much=False, + use_detection_val=True, + tag_threshold=1.0): + # tag_k [num_images, max_num_people, num_joints] + # loc_k [num_images, max_num_people, num_joints, 2] + # val_k [num_images, max_num_people, num_joints] + tag_k, loc_k, val_k = inp + # default_ [num_joints, 4] + default_ = np.zeros((num_joints, 3 + 1)) # tag_k.shape[2] assumed to be 1 # pytorch shape: (17, 4) + + joint_dict = {} + tag_dict = {} + for i in range(num_joints): + idx = joint_order[i] + + # tags 
[max_num_people, 1] + tags = tag_k[:, idx:idx + 1] + # joints [max_num_people, 4] + joints = np.concatenate((loc_k[:, idx, :], val_k[:, idx:idx + 1], tags), 1) + # mask [max_num_people] + mask = joints[:, 2] > detection_threshold + tags = tags[mask] + joints = joints[mask] + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + grouped_keys = list(joint_dict.keys())[:max_num_people] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + if ignore_too_much \ + and len(grouped_keys) == max_num_people: + continue + + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + ( + diff_normed, + np.zeros((num_added, num_added - num_grouped)) + 1e10 + ), + axis=1 + ) + + pairs = py_max_match(diff_normed) + for row, col in pairs: + if ( + row < num_added + and col < num_grouped + and diff_saved[row][col] < tag_threshold + ): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + # ans [len(joint_dict), num_joints, 4] + ans = np.array([joint_dict[i] for i in joint_dict]).astype(np.float32) + return ans + + +def py_max_match(scores): + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(np.int32) + return tmp + + +def adjust_func(ans, det): + # ans [[num_joints_detected, num_joints, 4]] + # det [144, 192, 17] + for batch_id, people in enumerate(ans): + for people_id, i in enumerate(people): + for joint_id, joint in enumerate(i): + if joint[2] > 0: + y, x = joint[0:2] + xx, yy = int(x), int(y) + # print(batch_id, joint_id, det[batch_id].shape) + tmp = det[batch_id][..., joint_id] + if tmp[xx, min(yy + 1, tmp.shape[1] - 1)] > tmp[xx, max(yy - 1, 0)]: + y += 0.25 + else: + y -= 0.25 + + if tmp[min(xx + 1, tmp.shape[0] - 1), yy] > tmp[max(0, xx - 1), yy]: + x += 0.25 + else: + x -= 0.25 + ans[batch_id][people_id, joint_id, 0:2] = (y + 0.5, x + 0.5) + # ans [[num_joints_detected, num_joints, 4]] + return ans + + +def refine_func(det, tag, keypoints): + # det [144, 192, 17] + # tag [144, 192, 17] + # keypoints [num_joints, 4] + if len(tag.shape) == 3: + # tag shape: (17, 128, 128, 1) + # tag [144, 192, 17, 1] + tag = tag[:, :, :, None] + + tags = [] + for i in range(keypoints.shape[0]): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(np.int32) + tags.append(tag[y, x, i]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + ans = [] + + for i in range(keypoints.shape[0]): + # score of joints i at all position + tmp = det[:, :, i] + # distance of all tag values with mean tag of current detected people + tt = (((tag[:, :, i] - prev_tag[None, None, :]) ** 2).sum(axis=2) ** 0.5) + tmp2 = tmp - np.round(tt) + + # find maximum position + y, x = np.unravel_index(np.argmax(tmp2), tmp.shape) + xx = x + yy = y + # detection score at maximum position + val = tmp[y, x] + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if tmp[yy, min(xx + 1, 
tmp.shape[1] - 1)] > tmp[yy, max(xx - 1, 0)]: + x += 0.25 + else: + x -= 0.25 + + if tmp[min(yy + 1, tmp.shape[0] - 1), xx] > tmp[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + ans.append((x, y, val)) + ans = np.array(ans) + + if ans is not None: + for i in range(det.shape[2]): + # add keypoint if it is not detected + if ans[i, 2] > 0 and keypoints[i, 2] == 0: + # if ans[i, 2] > 0.01 and keypoints[i, 2] == 0: + keypoints[i, :2] = ans[i, :2] + keypoints[i, 2] = ans[i, 2] + # keypoints [num_joints_detected, num_joints, 4] + return keypoints + + +def processKeypoints(keypoints): + # keypoints [17, 4] + tmp = keypoints.copy() + if keypoints[:, 2].max() > 0: + num_keypoints = keypoints.shape[0] + for i in range(num_keypoints): + tmp[i][0:3] = [ + float(keypoints[i][0]), + float(keypoints[i][1]), + float(keypoints[i][2]) + ] + + return tmp diff --git a/picamera2/devices/imx500/postprocess_nanodet.py b/picamera2/devices/imx500/postprocess_nanodet.py new file mode 100644 index 00000000..b7f1d0b8 --- /dev/null +++ b/picamera2/devices/imx500/postprocess_nanodet.py @@ -0,0 +1,63 @@ +""" +Nanodet postprocessing + +This code is based on: +https://github.com/RangiLyu/nanodet +""" + +from typing import Tuple + +import numpy as np + +from picamera2.devices.imx500.postprocess import combined_nms, softmax + + +def postprocess_nanodet_detection(outputs, + conf: float = 0.0, + iou_thres: float = 0.65, + max_out_dets: int = 300) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + reg_max = 7 + num_categories = 80 + classes = outputs[..., :num_categories] + boxes = outputs[..., num_categories:] + classes = 1 / (1 + np.exp(-classes)) # sigmoid + + # Extract feature map sizes + strides = [8, 16, 32, 64] + featmap_sizes = [(np.ceil(416 / stride), np.ceil(416 / stride)) for stride in strides] + + # Generate priors + anchors = generate_anchors_NANODET(featmap_sizes, strides) + + # Decode bboxes + batch = boxes.shape[0] + x = np.reshape(boxes, newshape=(batch, -1, 4, reg_max + 1)) + x = softmax(x) + x = np.matmul(x, np.arange(0, reg_max + 1, 1, dtype=np.float32)) + x = np.reshape(x, newshape=(batch, -1, 4)) + distances = x * anchors[..., 2, None] + + # Output Box format: [x_c, y_c, w, h] + w = distances[..., 0:1] + distances[..., 2:3] + h = distances[..., 1:2] + distances[..., 3:4] + x_c = anchors[..., 0:1] - distances[..., 0:1] + w / 2 + y_c = anchors[..., 1:2] - distances[..., 1:2] + h / 2 + boxes = np.concatenate([x_c, y_c, w, h], axis=2) + + return combined_nms(boxes, classes, iou_thres, conf, max_out_dets) + + +def generate_anchors_NANODET(featmap_sizes, strides): + anchors_list = [] + for i, stride in enumerate(strides): + h, w = featmap_sizes[i] + x_range = np.arange(w) * stride + y_range = np.arange(h) * stride + y, x = np.meshgrid(y_range, x_range) + y = y.flatten() + x = x.flatten() + strides = np.ones_like(x) * stride + anchors = np.stack([y, x, strides, strides], axis=-1) + anchors = np.expand_dims(anchors, axis=0) + anchors_list.append(anchors) + return np.concatenate(anchors_list, axis=1) diff --git a/picamera2/devices/imx500/postprocess_yolov5.py b/picamera2/devices/imx500/postprocess_yolov5.py new file mode 100644 index 00000000..d9c5600f --- /dev/null +++ b/picamera2/devices/imx500/postprocess_yolov5.py @@ -0,0 +1,244 @@ +""" +Yolov5 postprocessing + +This code is based on: +https://github.com/ultralytics/ultralytics +""" +from typing import List + +import cv2 +import numpy as np + +from picamera2.devices.imx500.postprocess import ( + BoxFormat, convert_to_ymin_xmin_ymax_xmax_format, nms) 
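+
+# Example usage (a sketch for orientation only; `outputs` is assumed to be the
+# list of three YOLOv5 head tensors produced by the network, and the image and
+# model sizes below are illustrative):
+#
+#     boxes, scores, classes = postprocess_yolov5_detection(
+#         outputs, model_input_shape=(640, 640), conf_thres=0.3, iou_thres=0.65)
+#     boxes = scale_boxes(boxes, h_image=480, w_image=640, h_model=640,
+#                         w_model=640, preserve_aspect_ratio=True)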
+ +default_anchors = [[10, 13, 16, 30, 33, 23], + [30, 61, 62, 45, 59, 119], + [116, 90, 156, 198, 373, 326]] +default_strides = [8, 16, 32] + + +def postprocess_yolov5_detection(outputs: List[np.ndarray], + model_input_shape=(640, 640), + num_categories=80, + min_wh=2, + max_wh=7680, + conf_thres: float = 0.001, + iou_thres: float = 0.65, + max_nms_dets: int = 5000, + max_out_dets: int = 1000): + H, W = model_input_shape + ############################################################ + # Box decoding + ############################################################ + outputs_decoded = box_decoding_yolov5n(tensors=outputs, num_categories=num_categories, H=H, W=W) + + ############################################################ + # Post processing for each input image + ############################################################ + # Note: outputs_decoded shape is [Batch,num_anchors*Detections,(4+1+num_categories)] + post_processed_outputs = [] + for _, x in enumerate(outputs_decoded): + # ---------------------------------------- + # Filter by score and width-height + # ---------------------------------------- + scores = x[..., 4] + wh = x[..., 2:4] + valid_indexs = (scores > conf_thres) & ((wh > min_wh).any(1)) & ((wh < max_wh).any(1)) + x = x[valid_indexs] + + # ---------------------------------------- + # Taking Best class only + # ---------------------------------------- + x[..., 5:] *= x[..., 4:5] # compute confidence per class (class_score * object_score) + conf = np.max(x[:, 5:], axis=1, keepdims=True) + classes_id = np.argmax(x[:, 5:], axis=1, keepdims=True) + + # Change boxes format from [x_c,y_c,w,h] to [y_min,x_min,y_max,x_max] + boxes = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + x = np.concatenate((boxes, conf, classes_id), axis=1)[conf.reshape(-1) > conf_thres] + + # --------------------------- # + # NMS + # --------------------------- # + x = x[np.argsort(-x[:, 4])[:max_nms_dets]] # sort by confidence from high to low + offset = x[..., 5:6] * np.maximum(H, W) + boxes_offset, scores = x[..., :4] + offset, x[..., 4] # boxes with offset by class + valid_indexs = nms(dets=boxes_offset, scores=scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + + boxes = x[..., :4] + # --------------------------- # + # Classes process + # --------------------------- # + # convert classes from coco80 to coco91 to match labels + classes = coco80_to_coco91(x[..., 5]) if num_categories == 80 else x[..., 5] + classes -= 1 + + # --------------------------- # + # Scores + # --------------------------- # + scores = x[..., 4] + + # Add result + post_processed_outputs.append({'boxes': boxes, 'classes': classes, 'scores': scores}) + + return post_processed_outputs[0]['boxes'], post_processed_outputs[0]['scores'], post_processed_outputs[0]['classes'] + + +def box_decoding_yolov5n(tensors, + num_categories=80, + H=640, + W=640, + anchors=default_anchors, + strides=default_strides): + # Tensors box format: [x_c, y_c, w, h] + no = num_categories + 5 # number of outputs per anchor + nl = len(anchors) # number of detection layers + na = len(anchors[0]) // 2 # number of anchors + anchor_grid = np.reshape(np.array(anchors), [nl, 1, -1, 1, 2]) + anchor_grid = anchor_grid.astype(np.float32) + z = [] + for i in range(nl): + ny, nx = H // strides[i], W // strides[i] + xv, yv = np.meshgrid(np.arange(nx), np.arange(ny)) + grid = np.reshape(np.stack([xv, yv], 2), [1, 1, ny * nx, 2]).astype(np.float32) + + y = tensors[i] + y = np.transpose(y, [0, 2, 1, 3]) + xy = (y[..., 
0:2] * 2 - 0.5 + grid) * strides[i] # xy + wh = (y[..., 2:4] * 2) ** 2 * anchor_grid[i] + + # Output box format: [x_c, y_c, w, h] + y = np.concatenate([xy, wh, y[..., 4:]], -1) + z.append(np.reshape(y, [-1, na * ny * nx, no])) + + return np.concatenate(z, 1) + + +# same as in preprocess but differs in h/w location +def scale_boxes(boxes: np.ndarray, h_image: int, w_image: int, h_model: int, w_model: int, + preserve_aspect_ratio: bool) -> np.ndarray: + """ + Scale and offset bounding boxes based on model output size and original image size. + + Args: + boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max]. + h_image (int): Original image height. + w_image (int): Original image width. + h_model (int): Model output height. + w_model (int): Model output width. + preserve_aspect_ratio (bool): Whether to preserve image aspect ratio during scaling + + Returns: + numpy.ndarray: Scaled and offset bounding boxes. + """ + deltaH, deltaW = 0, 0 + H, W = h_model, w_model + scale_H, scale_W = h_image / H, w_image / W + + if preserve_aspect_ratio: + scale_H = scale_W = max(h_image / H, w_image / W) + H_tag = int(np.round(h_image / scale_H)) + W_tag = int(np.round(w_image / scale_W)) + deltaH, deltaW = int((H - H_tag) / 2), int((W - W_tag) / 2) + + # Scale and offset boxes + boxes[..., 0] = (boxes[..., 0] - deltaH) * scale_H + boxes[..., 1] = (boxes[..., 1] - deltaW) * scale_W + boxes[..., 2] = (boxes[..., 2] - deltaH) * scale_H + boxes[..., 3] = (boxes[..., 3] - deltaW) * scale_W + + # Clip boxes + boxes = clip_boxes(boxes, h_image, w_image) + + return boxes + + +# same as in preprocess but differs in h/w location +def clip_boxes(boxes: np.ndarray, h: int, w: int) -> np.ndarray: + """ + Clip bounding boxes to stay within the image boundaries. + + Args: + boxes (numpy.ndarray): Array of bounding boxes in format [y_min, x_min, y_max, x_max]. + h (int): Height of the image. + w (int): Width of the image. + + Returns: + numpy.ndarray: Clipped bounding boxes. 
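+
+    Example (a minimal sketch; the input array is illustrative):
+        >>> b = np.array([[-5., 10., 700., 650.]])
+        >>> clip_boxes(b, h=480, w=640)
+        array([[  0.,  10., 480., 640.]])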
+
+    """
+    boxes[..., 0] = np.clip(boxes[..., 0], a_min=0, a_max=h)
+    boxes[..., 1] = np.clip(boxes[..., 1], a_min=0, a_max=w)
+    boxes[..., 2] = np.clip(boxes[..., 2], a_min=0, a_max=h)
+    boxes[..., 3] = np.clip(boxes[..., 3], a_min=0, a_max=w)
+    return boxes
+
+
+def _normalize_coordinates(boxes, orig_width, orig_height, boxes_format):
+    """
+    Normalizes boxes given in original image coordinates to values between 0 and 1
+
+    :param boxes: bounding boxes in original image coordinates
+    :param orig_width: original image width
+    :param orig_height: original image height
+    :param boxes_format: if the boxes are in XMIN_YMIN_W_H or YMIN_XMIN_YMAX_XMAX format
+    :return: the normalized boxes
+    """
+    if len(boxes) == 0:
+        return boxes
+    elif _are_boxes_normalized(boxes):
+        return boxes
+    boxes[:, 0] = np.divide(boxes[:, 0], orig_height)
+    boxes[:, 1] = np.divide(boxes[:, 1], orig_width)
+    boxes[:, 2] = np.divide(boxes[:, 2], orig_height)
+    boxes[:, 3] = np.divide(boxes[:, 3], orig_width)
+    return boxes
+
+
+def _are_boxes_normalized(boxes):
+    if len(boxes) == 0:
+        return True  # it doesn't matter
+    if max(boxes[0]) > 1:
+        return False
+    return True
+
+
+def apply_normalization(boxes, orig_width, orig_height, boxes_format):
+    if _are_boxes_normalized(boxes):
+        return boxes
+    return _normalize_coordinates(boxes, orig_width, orig_height, boxes_format)
+
+
+# COCO 80-class to 91-class index conversion table
+def coco80_to_coco91(x):  # converts 80-index to 91-index
+    coco91Indexs = np.array(
+        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
+         35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+         63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90])
+
+    return coco91Indexs[x.astype(np.int32)]
+
+
+def yolov5n_preprocess(img):
+    # AspectPreservingResizeWithPad
+    new_height = 640
+    new_width = 640
+    pad_value = 114
+    resize_method = 3  # cv2.INTER_AREA
+    resize_ratio = max(img.shape[0] / new_height, img.shape[1] / new_width)
+    height_tag = int(np.round(img.shape[0] / resize_ratio))
+    width_tag = int(np.round(img.shape[1] / resize_ratio))
+    pad_values = ((int((new_height - height_tag) / 2), int((new_height - height_tag) / 2 + 0.5)),
+                  (int((new_width - width_tag) / 2), int((new_width - width_tag) / 2 + 0.5)),
+                  (0, 0))
+
+    resized_img = cv2.resize(img, (width_tag, height_tag), interpolation=resize_method)
+    padded_img = np.pad(resized_img, pad_values, constant_values=pad_value)
+
+    # Normalize
+    mean = 0
+    std = 255
+    normalized_img = (padded_img - mean) / std
+
+    return normalized_img
diff --git a/picamera2/devices/imx500/postprocess_yolov8.py b/picamera2/devices/imx500/postprocess_yolov8.py
new file mode 100644
index 00000000..91a1d63a
--- /dev/null
+++ b/picamera2/devices/imx500/postprocess_yolov8.py
@@ -0,0 +1,176 @@
+"""
+Yolov8 postprocessing
+
+This code is based on:
+https://github.com/ultralytics/ultralytics
+"""
+from typing import Tuple
+
+import cv2
+import numpy as np
+
+from picamera2.devices.imx500.postprocess import (
+    BoxFormat, combined_nms, combined_nms_seg,
+    convert_to_ymin_xmin_ymax_xmax_format, crop_mask, nms)
+
+
+def postprocess_yolov8_detection(outputs: Tuple[np.ndarray, np.ndarray],
+                                 conf: float = 0.3,
+                                 iou_thres: float = 0.7,
+                                 max_out_dets: int = 50) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Postprocess the outputs of a YOLOv8 model for object detection.
+
+    Args:
+        outputs (Tuple[np.ndarray, np.ndarray]): Tuple containing the model outputs for bounding boxes and class predictions.
+
+        conf (float, optional): Confidence threshold for bounding box predictions. Default is 0.3.
+        iou_thres (float, optional): IoU (Intersection over Union) threshold for Non-Maximum Suppression (NMS). Default is 0.7.
+        max_out_dets (int, optional): Maximum number of output detections to keep after NMS. Default is 50.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple containing the post-processed bounding boxes,
+        their corresponding scores, and categories.
+    """
+    feat_sizes = np.array([80, 40, 20])
+    stride_sizes = np.array([8, 16, 32])
+    a, s = (x.transpose() for x in make_anchors_yolo_v8(feat_sizes, stride_sizes, 0.5))
+
+    y_bb, y_cls = outputs
+    dbox = dist2bbox_yolo_v8(y_bb, a, xywh=True, dim=1) * s
+    detect_out = np.concatenate((dbox, y_cls), 1)
+
+    xd = detect_out.transpose([0, 2, 1])
+
+    return combined_nms(xd[..., :4], xd[..., 4:84], iou_thres, conf, max_out_dets)
+
+
+def postprocess_yolov8_keypoints(outputs: Tuple[np.ndarray, np.ndarray, np.ndarray],
+                                 conf: float = 0.3,
+                                 iou_thres: float = 0.7,
+                                 max_out_dets: int = 300) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Postprocess the outputs of a YOLOv8 model for object detection and pose estimation.
+
+    Args:
+        outputs (Tuple[np.ndarray, np.ndarray, np.ndarray]): Tuple containing the model outputs for bounding boxes,
+        class predictions, and keypoint predictions.
+        conf (float, optional): Confidence threshold for bounding box predictions. Default is 0.3.
+        iou_thres (float, optional): IoU (Intersection over Union) threshold for Non-Maximum Suppression (NMS). Default is 0.7.
+        max_out_dets (int, optional): Maximum number of output detections to keep after NMS. Default is 300.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray, np.ndarray]: Tuple containing the post-processed bounding boxes, their
+        corresponding scores, and keypoints.
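+
+    Example (illustrative only; the random tensors stand in for real IMX500
+    outputs, assuming a 640x640 single-class pose model with 8400 anchors):
+        >>> y_bb = np.random.rand(1, 4, 8400).astype(np.float32)
+        >>> y_cls = np.random.rand(1, 1, 8400).astype(np.float32)
+        >>> y_kpts = np.random.rand(1, 51, 8400).astype(np.float32)
+        >>> boxes, scores, kpts = postprocess_yolov8_keypoints((y_bb, y_cls, y_kpts))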
+ + """ + kpt_shape = (17, 3) + feat_sizes = np.array([80, 40, 20]) + stride_sizes = np.array([8, 16, 32]) + a, s = (x.transpose() for x in make_anchors_yolo_v8(feat_sizes, stride_sizes, 0.5)) + + y_bb, y_cls, kpts = outputs + dbox = dist2bbox_yolo_v8(y_bb, a, xywh=True, dim=1) * s + detect_out = np.concatenate((dbox, y_cls), 1) + # additional part for pose estimation + ndim = kpt_shape[1] + pred_kpt = kpts.copy() + if ndim == 3: + pred_kpt[:, 2::3] = 1 / (1 + np.exp(-pred_kpt[:, 2::3])) # sigmoid (WARNING: inplace .sigmoid_() Apple MPS bug) + pred_kpt[:, 0::ndim] = (pred_kpt[:, 0::ndim] * 2.0 + (a[0] - 0.5)) * s + pred_kpt[:, 1::ndim] = (pred_kpt[:, 1::ndim] * 2.0 + (a[1] - 0.5)) * s + + x = np.concatenate([detect_out.transpose([2, 1, 0]).squeeze(), pred_kpt.transpose([2, 1, 0]).squeeze()], 1) + x = x[(x[:, 4] > conf)] + x = x[np.argsort(-x[:, 4])[:8400]] + x[..., :4] = convert_to_ymin_xmin_ymax_xmax_format(x[..., :4], BoxFormat.XC_YC_W_H) + boxes = x[..., :4] + scores = x[..., 4] + + # Original post-processing part + valid_indexs = nms(boxes, scores, iou_thres=iou_thres, max_out_dets=max_out_dets) + x = x[valid_indexs] + nms_bbox = x[:, :4] + nms_scores = x[:, 4] + nms_kpts = x[:, 5:] + + return nms_bbox, nms_scores, nms_kpts + + +def postprocess_yolov8_inst_seg(outputs: Tuple[np.ndarray, np.ndarray, np.ndarray], + conf: float = 0.001, + iou_thres: float = 0.7, + max_out_dets: int = 300) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + feat_sizes = np.array([80, 40, 20]) + stride_sizes = np.array([8, 16, 32]) + a, s = (x.transpose() for x in make_anchors_yolo_v8(feat_sizes, stride_sizes, 0.5)) + + y_bb, y_cls, ymask_weights, y_masks = outputs + dbox = dist2bbox_yolo_v8(y_bb, a, xywh=True, dim=1) * s + detect_out = np.concatenate((dbox, y_cls), 1) + + xd = detect_out.transpose([0, 2, 1]) + nms_bbox, nms_scores, nms_classes, ymask_weights = combined_nms_seg(xd[..., :4], xd[..., 4:84], + ymask_weights, iou_thres, conf, max_out_dets)[0] + if len(nms_scores) == 0: + final_masks = y_masks + else: + y_masks = y_masks.squeeze(0) + ymask_weights = ymask_weights.transpose(1, 0) + final_masks = np.tensordot(ymask_weights, y_masks, axes=([0], [0])) + + return nms_bbox, nms_scores, nms_classes, final_masks + + +def make_anchors_yolo_v8(feats, strides, grid_cell_offset=0.5): + """Generate anchors from features.""" + anchor_points, stride_tensor = [], [] + assert feats is not None + for i, stride in enumerate(strides): + h, w = feats[i], feats[i] + sx = np.arange(stop=w) + grid_cell_offset # shift x + sy = np.arange(stop=h) + grid_cell_offset # shift y + sy, sx = np.meshgrid(sy, sx, indexing='ij') + anchor_points.append(np.stack((sx, sy), -1).reshape((-1, 2))) + stride_tensor.append(np.full((h * w, 1), stride)) + return np.concatenate(anchor_points), np.concatenate(stride_tensor) + + +def dist2bbox_yolo_v8(distance, anchor_points, xywh=True, dim=-1): + """Transform distance(ltrb) to box(xywh or xyxy).""" + lt, rb = np.split(distance, 2, axis=dim) + x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + if xywh: + c_xy = (x1y1 + x2y2) / 2 + wh = x2y2 - x1y1 + return np.concatenate((c_xy, wh), dim) # xywh bbox + return np.concatenate((x1y1, x2y2), dim) # xyxy bbox + + +def pad_with_zeros(mask, roi, isp_output_size): + new_shape = (isp_output_size.width, isp_output_size.height, mask.shape[2]) + padded_mask = np.zeros(new_shape, dtype=mask.dtype) + padded_mask[roi.x:roi.x + mask.shape[0], roi.y:roi.y + mask.shape[1], :] = mask + return padded_mask + + +def process_masks(masks, boxes, roi, isp_output_size): 
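+    """Crop, normalize, resize and pad instance masks.
+
+    Each mask is cropped to its bounding box, passed through a sigmoid,
+    resized to the ROI size and zero-padded up to the full ISP output size.
+    Masks are returned in channel-first order.
+    """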
+
+    # Crop masks based on bounding boxes
+    masks = crop_mask(masks, boxes)
+
+    # Apply sigmoid function to normalize masks into [0, 1]
+    masks = 1 / (1 + np.exp(-masks))
+    masks = np.transpose(masks, (2, 1, 0))  # Change from channel-first to channel-last layout
+
+    # Resize masks to the ROI size
+    masks = cv2.resize(masks, (roi.height, roi.width), interpolation=cv2.INTER_LINEAR)
+
+    # Restore the channel axis, which cv2.resize drops for a single mask
+    masks = np.expand_dims(masks, -1) if len(masks.shape) == 2 else masks
+
+    # Zero-pad masks up to the full ISP output size
+    masks = pad_with_zeros(masks, roi, isp_output_size)
+
+    masks = np.transpose(masks, (2, 1, 0))  # Change back to channel-first format
+    return masks
diff --git a/setup.py b/setup.py
index f1d8e6b8..a5acaa6d 100644
--- a/setup.py
+++ b/setup.py
@@ -30,9 +30,12 @@
         "Programming Language :: Python :: 3.9",
         "Topic :: Multimedia :: Graphics :: Capture :: Digital Camera",
     ],
-    packages=['picamera2', 'picamera2.devices', 'picamera2.devices.hailo', 'picamera2.devices.imx708',
-              'picamera2.encoders', 'picamera2.outputs', 'picamera2.previews', 'picamera2.allocators'],
+    packages=['picamera2', 'picamera2.devices', 'picamera2.devices.hailo', 'picamera2.devices.imx500',
+              'picamera2.devices.imx708', 'picamera2.encoders', 'picamera2.outputs', 'picamera2.previews',
+              'picamera2.allocators'],
     python_requires='>=3.9',
     licence='BSD 2-Clause License',
-    install_requires=['numpy', 'PiDNG', 'piexif', 'pillow', 'simplejpeg', 'v4l2-python3', 'python-prctl', 'av'],
+    install_requires=['numpy', 'PiDNG', 'piexif', 'pillow', 'simplejpeg', 'v4l2-python3',
+                      'python-prctl', 'av', 'libarchive-c', 'tqdm',
+                      'jsonschema'],
     extras_require={"gui": ['pyopengl', 'PyQt5']})
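
For a quick sanity check of the NumPy-based keypoint NMS helpers added in
postprocess_highernet.py, the following minimal sketch can be run standalone
(random data stands in for real heatmaps; the 144x192x17 shape follows the
shape comments in that file):

    import numpy as np

    from picamera2.devices.imx500.postprocess_highernet import nms, np_topk

    det = np.random.rand(144, 192, 17).astype(np.float32)  # stand-in heatmaps
    det_nms = nms(det, nms_kernel=5, nms_padding=2)        # zero out non-maxima
    # Top 30 candidate locations per joint, as consumed by top_k()
    vals, inds = np_topk(det_nms.reshape(1, -1, 17), k=30)
    assert vals.shape == (1, 30, 17) and inds.shape == (1, 30, 17)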